665 files changed, 82199 insertions, 9167 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 22f7ccd58d38..0f628041e3f7 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -460,8 +460,10 @@ static int __init init_v9fs(void)
 
 	ret = v9fs_mux_global_init();
 	if (!ret)
-		ret = register_filesystem(&v9fs_fs_type);
-
+		return ret;
+	ret = register_filesystem(&v9fs_fs_type);
+	if (!ret)
+		v9fs_mux_global_exit();
 	return ret;
 }
 
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index eae50c9d6dc4..5241c600ce28 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -204,7 +204,6 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = sb->s_blocksize;
 		inode->i_blocks = 0;
 		inode->i_rdev = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -234,7 +233,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 			inode->i_op = &v9fs_symlink_inode_operations;
 			break;
 		case S_IFDIR:
-			inode->i_nlink++;
+			inc_nlink(inode);
 			if(v9ses->extended)
 				inode->i_op = &v9fs_dir_inode_operations_ext;
 			else
@@ -950,9 +949,8 @@ v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode,
 
 	inode->i_size = stat->length;
 
-	inode->i_blksize = sb->s_blocksize;
 	inode->i_blocks =
-	    (inode->i_size + inode->i_blksize - 1) >> sb->s_blocksize_bits;
+	    (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 }
 
 /**
diff --git a/fs/Kconfig b/fs/Kconfig
index a27002668bd3..fee318e6f4bb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,8 @@
 
 menu "File systems"
 
+if BLOCK
+
 config EXT2_FS
 	tristate "Second extended fs support"
 	help
@@ -72,11 +74,11 @@ config EXT3_FS
 	tristate "Ext3 journalling file system support"
 	select JBD
 	help
-	  This is the journaling version of the Second extended file system
+	  This is the journalling version of the Second extended file system
 	  (often called ext3), the de facto standard Linux file system
 	  (method to organize files on a storage device) for hard disks.
 
-	  The journaling code included in this driver means you do not have
+	  The journalling code included in this driver means you do not have
 	  to run e2fsck (file system checker) on your file systems after a
 	  crash.  The journal keeps track of any changes that were being made
 	  at the time the system crashed, and can ensure that your file system
@@ -138,10 +140,77 @@ config EXT3_FS_SECURITY
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
+config EXT4DEV_FS
+	tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	select JBD2
+	help
+	  Ext4dev is a predecessor filesystem of the next generation
+	  extended fs ext4, based on ext3 filesystem code. It will be
+	  renamed ext4 fs later, once ext4dev is mature and stabilized.
+
+	  Unlike the change from ext2 filesystem to ext3 filesystem,
+	  the on-disk format of ext4dev is not the same as ext3 any more:
+	  it is based on extent maps and it supports 48-bit physical block
+	  numbers. These combined on-disk format changes will allow
+	  ext4dev/ext4 to handle more than 16 TB filesystem volumes --
+	  a hard limit that ext3 cannot overcome without changing the
+	  on-disk format.
+
+	  Other than extent maps and 48-bit block numbers, ext4dev also is
+	  likely to have other new features such as persistent preallocation,
+	  high resolution time stamps, and larger file support etc.  These
+	  features will be added to ext4dev gradually.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called ext4dev.  Be aware, however, that the filesystem
+	  of your root partition (the one containing the directory /) cannot
+	  be compiled as a module, and so this could be dangerous.
+
+	  If unsure, say N.
+
+config EXT4DEV_FS_XATTR
+	bool "Ext4dev extended attributes"
+	depends on EXT4DEV_FS
+	default y
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+	  You need this for POSIX ACL support on ext4dev/ext4.
+
+config EXT4DEV_FS_POSIX_ACL
+	bool "Ext4dev POSIX Access Control Lists"
+	depends on EXT4DEV_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config EXT4DEV_FS_SECURITY
+	bool "Ext4dev Security Labels"
+	depends on EXT4DEV_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ext4dev/ext4 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
 config JBD
 	tristate
 	help
-	  This is a generic journaling layer for block devices.  It is
+	  This is a generic journalling layer for block devices.  It is
 	  currently used by the ext3 and OCFS2 file systems, but it could
 	  also be used to add journal support to other file systems or block
 	  devices such as RAID or LVM.
@@ -170,18 +239,50 @@ config JBD_DEBUG
 	  generated.  To turn debugging off again, do
 	  "echo 0 > /proc/sys/fs/jbd-debug".
 
+config JBD2
+	tristate
+	help
+	  This is a generic journaling layer for block devices that support
+	  both 32-bit and 64-bit block numbers.  It is currently used by
+	  the ext4dev/ext4 filesystem, but it could also be used to add
+	  journal support to other file systems or block devices such
+	  as RAID or LVM.
+
+	  If you are using ext4dev/ext4, you need to say Y here. If you are not
+	  using ext4dev/ext4 then you will probably want to say N.
+
+	  To compile this device as a module, choose M here. The module will be
+	  called jbd2.  If you are compiling ext4dev/ext4 into the kernel,
+	  you cannot compile this code as a module.
+
+config JBD2_DEBUG
+	bool "JBD2 (ext4dev/ext4) debugging support"
+	depends on JBD2
+	help
+	  If you are using the ext4dev/ext4 journaled file system (or
+	  potentially any other filesystem/device using JBD2), this option
+	  allows you to enable debugging output while the system is running,
+	  in order to help track down any problems you are having.
+	  By default, the debugging output will be turned off.
+
+	  If you select Y here, then you will be able to turn on debugging
+	  with "echo N > /proc/sys/fs/jbd2-debug", where N is a number between
+	  1 and 5. The higher the number, the more debugging output is
+	  generated.  To turn debugging off again, do
+	  "echo 0 > /proc/sys/fs/jbd2-debug".
+
 config FS_MBCACHE
-# Meta block cache for Extended Attributes (ext2/ext3)
+# Meta block cache for Extended Attributes (ext2/ext3/ext4)
 	tristate
-	depends on EXT2_FS_XATTR || EXT3_FS_XATTR
-	default y if EXT2_FS=y || EXT3_FS=y
-	default m if EXT2_FS=m || EXT3_FS=m
+	depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
+	default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
+	default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
 
 config REISERFS_FS
 	tristate "Reiserfs support"
 	help
 	  Stores not just filenames but the files themselves in a balanced
-	  tree.  Uses journaling.
+	  tree.  Uses journalling.
 
 	  Balanced trees are more efficient than traditional file system
 	  architectural foundations.
@@ -323,6 +424,7 @@ config FS_POSIX_ACL
 	default n
 
 source "fs/xfs/Kconfig"
+source "fs/gfs2/Kconfig"
 
 config OCFS2_FS
 	tristate "OCFS2 file system support"
@@ -399,6 +501,8 @@ config ROMFS_FS
 	  If you don't know whether you need it, then you don't need it:
 	  answer N.
 
+endif
+
 config INOTIFY
 	bool "Inotify file change notification support"
 	default y
@@ -530,6 +634,11 @@ config FUSE_FS
 	  If you want to develop a userspace FS, or if you want to use
 	  a filesystem based on FUSE, answer Y or M.
 
+config GENERIC_ACL
+	bool
+	select FS_POSIX_ACL
+
+if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
 config ISO9660_FS
@@ -597,7 +706,9 @@ config UDF_NLS
 	depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
 
 endmenu
+endif
 
+if BLOCK
 menu "DOS/FAT/NT Filesystems"
 
 config FAT_FS
@@ -782,6 +893,7 @@ config NTFS_RW
 	  It is perfectly safe to say N here.
 
 endmenu
+endif
 
 menu "Pseudo filesystems"
 
@@ -826,6 +938,25 @@ config PROC_VMCORE
         help
         Exports the dump image of crashed kernel in ELF format.
 
+config PROC_SYSCTL
+	bool "Sysctl support (/proc/sys)" if EMBEDDED
+	depends on PROC_FS
+	select SYSCTL
+	default y
+	---help---
+	  The sysctl interface provides a means of dynamically changing
+	  certain kernel parameters and variables on the fly without requiring
+	  a recompile of the kernel or reboot of the system.  The primary
+	  interface is through /proc/sys.  If you say Y here a tree of
+	  modifiable sysctl entries will be generated beneath the
+          /proc/sys directory. They are explained in the files
+	  in <file:Documentation/sysctl/>.  Note that enabling this
+	  option will enlarge the kernel by at least 8 KB.
+
+	  As it is generally a good thing, you should say Y here unless
+	  building a kernel for install/rescue disks or your system is very
+	  limited in memory.
+
 config SYSFS
 	bool "sysfs file system support" if EMBEDDED
 	default y
@@ -862,6 +993,19 @@ config TMPFS
 
 	  See <file:Documentation/filesystems/tmpfs.txt> for details.
 
+config TMPFS_POSIX_ACL
+	bool "Tmpfs POSIX Access Control Lists"
+	depends on TMPFS
+	select GENERIC_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N.
+
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
@@ -907,7 +1051,7 @@ menu "Miscellaneous filesystems"
 
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
@@ -935,7 +1079,7 @@ config ADFS_FS_RW
 
 config AFFS_FS
 	tristate "Amiga FFS file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  The Fast File System (FFS) is the common file system used on hard
 	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
@@ -955,9 +1099,21 @@ config AFFS_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called affs.  If unsure, say N.
 
+config ECRYPT_FS
+	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && KEYS && CRYPTO
+	help
+	  Encrypted filesystem that operates on the VFS layer.  See
+	  <file:Documentation/ecryptfs.txt> to learn more about
+	  eCryptfs.  Userspace components are required and can be
+	  obtained from <http://ecryptfs.sf.net>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ecryptfs.
+
 config HFS_FS
 	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	select NLS
 	help
 	  If you say Y here, you will be able to mount Macintosh-formatted
@@ -970,6 +1126,7 @@ config HFS_FS
 
 config HFSPLUS_FS
 	tristate "Apple Extended HFS file system support"
+	depends on BLOCK
 	select NLS
 	select NLS_UTF8
 	help
@@ -983,7 +1140,7 @@ config HFSPLUS_FS
 
 config BEFS_FS
 	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	select NLS
 	help
 	  The BeOS File System (BeFS) is the native file system of Be, Inc's
@@ -991,7 +1148,7 @@ config BEFS_FS
 	  on files and directories, and database-like indeces on selected
 	  attributes. (Also note that this driver doesn't make those features
 	  available at this time). It is a 64 bit filesystem, so it supports
-	  extremly large volumes and files.
+	  extremely large volumes and files.
 
 	  If you use this filesystem, you should also say Y to at least one
 	  of the NLS (native language support) options below.
@@ -1010,7 +1167,7 @@ config BEFS_DEBUG
 
 config BFS_FS
 	tristate "BFS file system support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  Boot File System (BFS) is a file system used under SCO UnixWare to
 	  allow the bootloader access to the kernel image and other important
@@ -1032,7 +1189,7 @@ config BFS_FS
 
 config EFS_FS
 	tristate "EFS file system support (read only) (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on BLOCK && EXPERIMENTAL
 	help
 	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
 	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
@@ -1047,9 +1204,9 @@ config EFS_FS
 
 config JFFS_FS
 	tristate "Journalling Flash File System (JFFS) support"
-	depends on MTD
+	depends on MTD && BLOCK
 	help
-	  JFFS is the Journaling Flash File System developed by Axis
+	  JFFS is the Journalling Flash File System developed by Axis
 	  Communications in Sweden, aimed at providing a crash/powerdown-safe
 	  file system for disk-less embedded devices. Further information is
 	  available at (<http://developer.axis.com/software/jffs/>).
@@ -1219,7 +1376,7 @@ config JFFS2_CMODE_NONE
 config JFFS2_CMODE_PRIORITY
         bool "priority"
         help
-          Tries the compressors in a predefinied order and chooses the first
+          Tries the compressors in a predefined order and chooses the first
           successful one.
 
 config JFFS2_CMODE_SIZE
@@ -1232,6 +1389,7 @@ endchoice
 
 config CRAMFS
 	tristate "Compressed ROM file system support (cramfs)"
+	depends on BLOCK
 	select ZLIB_INFLATE
 	help
 	  Saying Y here includes support for CramFs (Compressed ROM File
@@ -1251,6 +1409,7 @@ config CRAMFS
 
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
+	depends on BLOCK
 	help
 	  FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
 	  file system format.  VERITAS VxFS(TM) is the standard file system
@@ -1268,6 +1427,7 @@ config VXFS_FS
 
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
+	depends on BLOCK
 	help
 	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
 	  is the file system used for organizing files on OS/2 hard disk
@@ -1284,6 +1444,7 @@ config HPFS_FS
 
 config QNX4FS_FS
 	tristate "QNX4 file system support (read only)"
+	depends on BLOCK
 	help
 	  This is the file system used by the real-time operating systems
 	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
@@ -1311,6 +1472,7 @@ config QNX4FS_RW
 
 config SYSV_FS
 	tristate "System V/Xenix/V7/Coherent file system support"
+	depends on BLOCK
 	help
 	  SCO, Xenix and Coherent are commercial Unix systems for Intel
 	  machines, and Version 7 was used on the DEC PDP-11. Saying Y
@@ -1319,7 +1481,7 @@ config SYSV_FS
 
 	  If you have floppies or hard disk partitions like that, it is likely
 	  that they contain binaries from those other Unix systems; in order
-	  to run these binaries, you will want to install linux-abi which is a
+	  to run these binaries, you will want to install linux-abi which is
 	  a set of kernel modules that lets you run SCO, Xenix, Wyse,
 	  UnixWare, Dell Unix and System V programs under Linux.  It is
 	  available via FTP (user: ftp) from
@@ -1349,6 +1511,7 @@ config SYSV_FS
 
 config UFS_FS
 	tristate "UFS file system support (read only)"
+	depends on BLOCK
 	help
 	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
 	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
@@ -1827,7 +1990,7 @@ config CIFS_EXPERIMENTAL
 config CIFS_UPCALL
 	  bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
 	  depends on CIFS_EXPERIMENTAL
-	  select CONNECTOR
+	  depends on CONNECTOR
 	  help
 	    Enables an upcall mechanism for CIFS which will be used to contact
 	    userspace helper utilities to provide SPNEGO packaged Kerberos
@@ -1903,7 +2066,7 @@ config AFS_FS
 	  If you say Y here, you will get an experimental Andrew File System
 	  driver. It currently only supports unsecured read-only AFS access.
 
-	  See <file:Documentation/filesystems/afs.txt> for more intormation.
+	  See <file:Documentation/filesystems/afs.txt> for more information.
 
 	  If unsure, say N.
 
@@ -1923,13 +2086,16 @@ config 9P_FS
 
 endmenu
 
+if BLOCK
 menu "Partition Types"
 
 source "fs/partitions/Kconfig"
 
 endmenu
+endif
 
 source "fs/nls/Kconfig"
+source "fs/dlm/Kconfig"
 
 endmenu
 
diff --git a/fs/Makefile b/fs/Makefile
index 89135428a539..9a5ce9323bfd 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -5,12 +5,18 @@
 # Rewritten to use lists instead of if-statements.
 # 
 
-obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
-		block_dev.o char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
+obj-y :=	open.o read_write.o file_table.o super.o \
+		char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
-		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o pnode.o drop_caches.o splice.o sync.o
+		seq_file.o xattr.o libfs.o fs-writeback.o \
+		pnode.o drop_caches.o splice.o sync.o utimes.o
+
+ifeq ($(CONFIG_BLOCK),y)
+obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+else
+obj-y +=	no-block.o
+endif
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
@@ -35,6 +41,7 @@ obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
 obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
+obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
@@ -50,11 +57,14 @@ obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
+obj-$(CONFIG_DLM)		+= dlm/
  
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
+obj-$(CONFIG_EXT4DEV_FS)	+= ext4/ # Before ext2 so root fs can be ext4dev
 obj-$(CONFIG_JBD)		+= jbd/
+obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_RAMFS)		+= ramfs/
@@ -68,6 +78,7 @@ obj-$(CONFIG_BFS_FS)		+= bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
 obj-$(CONFIG_HFS_FS)		+= hfs/
+obj-$(CONFIG_ECRYPT_FS)		+= ecryptfs/
 obj-$(CONFIG_VXFS_FS)		+= freevxfs/
 obj-$(CONFIG_NFS_FS)		+= nfs/
 obj-$(CONFIG_EXPORTFS)		+= exportfs/
@@ -102,3 +113,4 @@ obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_GFS2_FS)           += gfs2/
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 1014b9f2117b..6101ea679cb1 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -27,10 +27,12 @@
 
 const struct file_operations adfs_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
 	.fsync		= file_fsync,
-	.write		= generic_file_write,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.sendfile	= generic_file_sendfile,
 };
 
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 534f3eecc985..7e7a04be1278 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -269,7 +269,6 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
 	inode->i_ino	 = obj->file_id;
 	inode->i_size	 = obj->size;
 	inode->i_nlink	 = 2;
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks	 = (inode->i_size + sb->s_blocksize - 1) >>
 			    sb->s_blocksize_bits;
 
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 82011019494c..9ade139086fc 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -251,8 +251,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(adfs_inode_cachep))
-		printk(KERN_INFO "adfs_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(adfs_inode_cachep);
 }
 
 static struct super_operations adfs_sops = {
@@ -339,11 +338,10 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_flags |= MS_NODIRATIME;
 
-	asb = kmalloc(sizeof(*asb), GFP_KERNEL);
+	asb = kzalloc(sizeof(*asb), GFP_KERNEL);
 	if (!asb)
 		return -ENOMEM;
 	sb->s_fs_info = asb;
-	memset(asb, 0, sizeof(*asb));
 
 	/* set default options */
 	asb->s_uid = 0;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 3de8590e4f6a..05b5e22de759 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -27,8 +27,10 @@ static int affs_file_release(struct inode *inode, struct file *filp);
 
 const struct file_operations affs_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.open		= affs_file_open,
 	.release	= affs_file_release,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 17352011ab67..5ea72c3a16c3 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -109,8 +109,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(affs_inode_cachep))
-		printk(KERN_INFO "affs_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(affs_inode_cachep);
 }
 
 static struct super_operations affs_sops = {
@@ -280,11 +279,10 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_op                = &affs_sops;
 	sb->s_flags |= MS_NODIRATIME;
 
-	sbi = kmalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 	sb->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(*sbi));
 	init_MUTEX(&sbi->s_bmlock);
 
 	if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2fc99877cb0d..a6ec75c56fcf 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -30,7 +30,7 @@ static int afs_dir_readdir(struct file *file, void *dirent, filldir_t filldir);
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
 static int afs_d_delete(struct dentry *dentry);
 static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
-				  loff_t fpos, ino_t ino, unsigned dtype);
+				  loff_t fpos, u64 ino, unsigned dtype);
 
 const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
@@ -211,8 +211,8 @@ static int afs_dir_open(struct inode *inode, struct file *file)
 {
 	_enter("{%lu}", inode->i_ino);
 
-	BUG_ON(sizeof(union afs_dir_block) != 2048);
-	BUG_ON(sizeof(union afs_dirent) != 32);
+	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
 	if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED)
 		return -ENOENT;
@@ -409,7 +409,7 @@ static int afs_dir_readdir(struct file *file, void *cookie, filldir_t filldir)
  *   uniquifier through dtype
  */
 static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
-				  loff_t fpos, ino_t ino, unsigned dtype)
+				  loff_t fpos, u64 ino, unsigned dtype)
 {
 	struct afs_dir_lookup_cookie *cookie = _cookie;
 
@@ -446,8 +446,8 @@ static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
 	_enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name);
 
 	/* insanity checks first */
-	BUG_ON(sizeof(union afs_dir_block) != 2048);
-	BUG_ON(sizeof(union afs_dirent) != 32);
+	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
 	if (dentry->d_name.len > 255) {
 		_leave(" = -ENAMETOOLONG");
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 67d6634101fd..2e8c42639eaa 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -16,7 +16,6 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include <linux/buffer_head.h>
 #include "volume.h"
 #include "vnode.h"
 #include <rxrpc/call.h>
@@ -37,7 +36,6 @@ struct inode_operations afs_file_inode_operations = {
 
 const struct address_space_operations afs_fs_aops = {
 	.readpage	= afs_file_readpage,
-	.sync_page	= block_sync_page,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.releasepage	= afs_file_releasepage,
 	.invalidatepage	= afs_file_invalidatepage,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 4ebb30a50ed5..6f37754906c2 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -72,7 +72,6 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
 	inode->i_ctime.tv_sec	= vnode->status.mtime_server;
 	inode->i_ctime.tv_nsec	= 0;
 	inode->i_atime		= inode->i_mtime = inode->i_ctime;
-	inode->i_blksize	= PAGE_CACHE_SIZE;
 	inode->i_blocks		= 0;
 	inode->i_version	= vnode->fid.unique;
 	inode->i_mapping->a_ops	= &afs_fs_aops;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 101d21b6c037..86463ec9ccb4 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -775,6 +775,7 @@ static int afs_proc_cell_servers_release(struct inode *inode,
  * first item
  */
 static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
+	__acquires(m->private->sv_lock)
 {
 	struct list_head *_p;
 	struct afs_cell *cell = m->private;
@@ -823,6 +824,7 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
  * clean up after reading from the cells list
  */
 static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
+	__releases(p->private->sv_lock)
 {
 	struct afs_cell *cell = p->private;
 
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 331f730a1fb3..782ee7c600ca 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -281,11 +281,10 @@ int afs_vlocation_lookup(struct afs_cell *cell,
 	spin_unlock(&cell->vl_gylock);
 
 	/* not in the cell's in-memory lists - create a new record */
-	vlocation = kmalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
+	vlocation = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
 	if (!vlocation)
 		return -ENOMEM;
 
-	memset(vlocation, 0, sizeof(struct afs_vlocation));
 	atomic_set(&vlocation->usage, 1);
 	INIT_LIST_HEAD(&vlocation->link);
 	rwlock_init(&vlocation->lock);
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 0ff4b86476e3..768c6dbd323a 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -186,11 +186,10 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
 	_debug("creating new volume record");
 
 	ret = -ENOMEM;
-	volume = kmalloc(sizeof(struct afs_volume), GFP_KERNEL);
+	volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
 	if (!volume)
 		goto error_up;
 
-	memset(volume, 0, sizeof(struct afs_volume));
 	atomic_set(&volume->usage, 1);
 	volume->type		= type;
 	volume->type_force	= force;
diff --git a/fs/aio.c b/fs/aio.c
index 950630187acc..94766599db00 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -15,6 +15,7 @@
 #include <linux/aio_abi.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/uio.h>
 
 #define DEBUG 0
 
@@ -414,6 +415,7 @@ static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
 	req->ki_retry = NULL;
 	req->ki_dtor = NULL;
 	req->private = NULL;
+	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
 
 	/* Check if the completion queue has enough free space to
@@ -459,6 +461,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 
 	if (req->ki_dtor)
 		req->ki_dtor(req);
+	if (req->ki_iovec != &req->ki_inline_vec)
+		kfree(req->ki_iovec);
 	kmem_cache_free(kiocb_cachep, req);
 	ctx->reqs_active--;
 
@@ -671,7 +675,7 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
 	}
 
 	if (!(iocb->ki_retried & 0xff)) {
-		pr_debug("%ld retry: %d of %d\n", iocb->ki_retried,
+		pr_debug("%ld retry: %zd of %zd\n", iocb->ki_retried,
 			iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
 	}
 
@@ -1004,7 +1008,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 
 	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
 
-	pr_debug("%ld retries: %d of %d\n", iocb->ki_retried,
+	pr_debug("%ld retries: %zd of %zd\n", iocb->ki_retried,
 		iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
 put_rq:
 	/* everything turned out well, dispose of the aiocb. */
@@ -1300,63 +1304,63 @@ asmlinkage long sys_io_destroy(aio_context_t ctx)
 	return -EINVAL;
 }
 
-/*
- * aio_p{read,write} are the default  ki_retry methods for
- * IO_CMD_P{READ,WRITE}.  They maintains kiocb retry state around potentially
- * multiple calls to f_op->aio_read().  They loop around partial progress
- * instead of returning -EIOCBRETRY because they don't have the means to call
- * kick_iocb().
- */
-static ssize_t aio_pread(struct kiocb *iocb)
+static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
 {
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t ret = 0;
-
-	do {
-		ret = file->f_op->aio_read(iocb, iocb->ki_buf,
-			iocb->ki_left, iocb->ki_pos);
-		/*
-		 * Can't just depend on iocb->ki_left to determine
-		 * whether we are done. This may have been a short read.
-		 */
-		if (ret > 0) {
-			iocb->ki_buf += ret;
-			iocb->ki_left -= ret;
+	struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];
+
+	BUG_ON(ret <= 0);
+
+	while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
+		ssize_t this = min((ssize_t)iov->iov_len, ret);
+		iov->iov_base += this;
+		iov->iov_len -= this;
+		iocb->ki_left -= this;
+		ret -= this;
+		if (iov->iov_len == 0) {
+			iocb->ki_cur_seg++;
+			iov++;
 		}
+	}
 
-		/*
-		 * For pipes and sockets we return once we have some data; for
-		 * regular files we retry till we complete the entire read or
-		 * find that we can't read any more data (e.g short reads).
-		 */
-	} while (ret > 0 && iocb->ki_left > 0 &&
-		 !S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode));
-
-	/* This means we must have transferred all that we could */
-	/* No need to retry anymore */
-	if ((ret == 0) || (iocb->ki_left == 0))
-		ret = iocb->ki_nbytes - iocb->ki_left;
-
-	return ret;
+	/* the caller should not have done more io than what fit in
+	 * the remaining iovecs */
+	BUG_ON(ret > 0 && iocb->ki_left == 0);
 }
 
-/* see aio_pread() */
-static ssize_t aio_pwrite(struct kiocb *iocb)
+static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
 {
 	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
+			 unsigned long, loff_t);
 	ssize_t ret = 0;
+	unsigned short opcode;
+
+	if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
+		(iocb->ki_opcode == IOCB_CMD_PREAD)) {
+		rw_op = file->f_op->aio_read;
+		opcode = IOCB_CMD_PREADV;
+	} else {
+		rw_op = file->f_op->aio_write;
+		opcode = IOCB_CMD_PWRITEV;
+	}
 
 	do {
-		ret = file->f_op->aio_write(iocb, iocb->ki_buf,
-			iocb->ki_left, iocb->ki_pos);
-		if (ret > 0) {
-			iocb->ki_buf += ret;
-			iocb->ki_left -= ret;
-		}
-	} while (ret > 0 && iocb->ki_left > 0);
+		ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
+			    iocb->ki_nr_segs - iocb->ki_cur_seg,
+			    iocb->ki_pos);
+		if (ret > 0)
+			aio_advance_iovec(iocb, ret);
+
+	/* retry all partial writes.  retry partial reads as long as its a
+	 * regular file. */
+	} while (ret > 0 && iocb->ki_left > 0 &&
+		 (opcode == IOCB_CMD_PWRITEV ||
+		  (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
 
+	/* This means we must have transferred all that we could */
+	/* No need to retry anymore */
 	if ((ret == 0) || (iocb->ki_left == 0))
 		ret = iocb->ki_nbytes - iocb->ki_left;
 
@@ -1383,6 +1387,38 @@ static ssize_t aio_fsync(struct kiocb *iocb)
 	return ret;
 }
 
+static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
+{
+	ssize_t ret;
+
+	ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
+				    kiocb->ki_nbytes, 1,
+				    &kiocb->ki_inline_vec, &kiocb->ki_iovec);
+	if (ret < 0)
+		goto out;
+
+	kiocb->ki_nr_segs = kiocb->ki_nbytes;
+	kiocb->ki_cur_seg = 0;
+	/* ki_nbytes/left now reflect bytes instead of segs */
+	kiocb->ki_nbytes = ret;
+	kiocb->ki_left = ret;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
+{
+	kiocb->ki_iovec = &kiocb->ki_inline_vec;
+	kiocb->ki_iovec->iov_base = kiocb->ki_buf;
+	kiocb->ki_iovec->iov_len = kiocb->ki_left;
+	kiocb->ki_nr_segs = 1;
+	kiocb->ki_cur_seg = 0;
+	kiocb->ki_nbytes = kiocb->ki_left;
+	return 0;
+}
+
 /*
  * aio_setup_iocb:
  *	Performs the initial checks and aio retry method
@@ -1405,9 +1441,12 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_READ);
 		if (unlikely(ret))
 			break;
+		ret = aio_setup_single_vector(kiocb);
+		if (ret)
+			break;
 		ret = -EINVAL;
 		if (file->f_op->aio_read)
-			kiocb->ki_retry = aio_pread;
+			kiocb->ki_retry = aio_rw_vect_retry;
 		break;
 	case IOCB_CMD_PWRITE:
 		ret = -EBADF;
@@ -1420,9 +1459,40 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_WRITE);
 		if (unlikely(ret))
 			break;
+		ret = aio_setup_single_vector(kiocb);
+		if (ret)
+			break;
+		ret = -EINVAL;
+		if (file->f_op->aio_write)
+			kiocb->ki_retry = aio_rw_vect_retry;
+		break;
+	case IOCB_CMD_PREADV:
+		ret = -EBADF;
+		if (unlikely(!(file->f_mode & FMODE_READ)))
+			break;
+		ret = security_file_permission(file, MAY_READ);
+		if (unlikely(ret))
+			break;
+		ret = aio_setup_vectored_rw(READ, kiocb);
+		if (ret)
+			break;
+		ret = -EINVAL;
+		if (file->f_op->aio_read)
+			kiocb->ki_retry = aio_rw_vect_retry;
+		break;
+	case IOCB_CMD_PWRITEV:
+		ret = -EBADF;
+		if (unlikely(!(file->f_mode & FMODE_WRITE)))
+			break;
+		ret = security_file_permission(file, MAY_WRITE);
+		if (unlikely(ret))
+			break;
+		ret = aio_setup_vectored_rw(WRITE, kiocb);
+		if (ret)
+			break;
 		ret = -EINVAL;
 		if (file->f_op->aio_write)
-			kiocb->ki_retry = aio_pwrite;
+			kiocb->ki_retry = aio_rw_vect_retry;
 		break;
 	case IOCB_CMD_FDSYNC:
 		ret = -EINVAL;
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index c7700d9b3f96..906ba5ce2261 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -149,6 +149,7 @@ extern const struct file_operations autofs_root_operations;
 /* Initializing function */
 
 int autofs_fill_super(struct super_block *, void *, int);
+void autofs_kill_sb(struct super_block *sb);
 
 /* Queue management functions */
 
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 3fded389d06b..bf8c8af98004 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -246,5 +246,4 @@ void autofs_hash_nuke(struct autofs_sb_info *sbi)
 			kfree(ent);
 		}
 	}
-	shrink_dcache_sb(sbi->sb);
 }
diff --git a/fs/autofs/init.c b/fs/autofs/init.c
index aca123752406..cea5219b4f37 100644
--- a/fs/autofs/init.c
+++ b/fs/autofs/init.c
@@ -24,7 +24,7 @@ static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
 	.get_sb		= autofs_get_sb,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= autofs_kill_sb,
 };
 
 static int __init init_autofs_fs(void)
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index af2efbbb5d76..54c518c89e4c 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -20,7 +20,7 @@
 #include "autofs_i.h"
 #include <linux/module.h>
 
-static void autofs_put_super(struct super_block *sb)
+void autofs_kill_sb(struct super_block *sb)
 {
 	struct autofs_sb_info *sbi = autofs_sbi(sb);
 	unsigned int n;
@@ -37,13 +37,13 @@ static void autofs_put_super(struct super_block *sb)
 	kfree(sb->s_fs_info);
 
 	DPRINTK(("autofs: shutting down\n"));
+	kill_anon_super(sb);
 }
 
 static void autofs_read_inode(struct inode *inode);
 
 static struct super_operations autofs_sops = {
 	.read_inode	= autofs_read_inode,
-	.put_super	= autofs_put_super,
 	.statfs		= simple_statfs,
 };
 
@@ -129,10 +129,9 @@ int autofs_fill_super(struct super_block *s, void *data, int silent)
 	struct autofs_sb_info *sbi;
 	int minproto, maxproto;
 
-	sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if ( !sbi )
 		goto fail_unlock;
-	memset(sbi, 0, sizeof(*sbi));
 	DPRINTK(("autofs: starting up, sbi = %p\n",sbi));
 
 	s->s_fs_info = sbi;
@@ -217,7 +216,6 @@ static void autofs_read_inode(struct inode *inode)
 	inode->i_nlink = 2;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_blocks = 0;
-	inode->i_blksize = 1024;
 
 	if ( ino == AUTOFS_ROOT_INO ) {
 		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
@@ -242,7 +240,7 @@ static void autofs_read_inode(struct inode *inode)
 		
 		inode->i_op = &autofs_symlink_inode_operations;
 		sl = &sbi->symlink[n];
-		inode->u.generic_ip = sl;
+		inode->i_private = sl;
 		inode->i_mode = S_IFLNK | S_IRWXUGO;
 		inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = sl->mtime;
 		inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 9cac08d6a873..368a1c33a3c8 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -414,7 +414,7 @@ static int autofs_root_rmdir(struct inode *dir, struct dentry *dentry)
 
 	dentry->d_time = (unsigned long)(struct autofs_dir_ent *)NULL;
 	autofs_hash_delete(ent);
-	dir->i_nlink--;
+	drop_nlink(dir);
 	d_drop(dentry);
 	unlock_kernel();
 
@@ -466,7 +466,7 @@ static int autofs_root_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ent->dentry = dentry;
 	autofs_hash_insert(dh,ent);
 
-	dir->i_nlink++;
+	inc_nlink(dir);
 	d_instantiate(dentry, iget(dir->i_sb,ino));
 	unlock_kernel();
 
diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
index 52e8772b066e..c74f2eb65775 100644
--- a/fs/autofs/symlink.c
+++ b/fs/autofs/symlink.c
@@ -15,7 +15,7 @@
 /* Nothing to release.. */
 static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-	char *s=((struct autofs_symlink *)dentry->d_inode->u.generic_ip)->data;
+	char *s=((struct autofs_symlink *)dentry->d_inode->i_private)->data;
 	nd_set_link(nd, s);
 	return NULL;
 }
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 480ab178cba5..b13f32c8aeee 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -94,7 +94,6 @@ struct autofs_wait_queue {
 
 struct autofs_sb_info {
 	u32 magic;
-	struct dentry *root;
 	int pipefd;
 	struct file *pipe;
 	pid_t oz_pgrp;
@@ -229,4 +228,4 @@ out:
 }
 
 void autofs4_dentry_release(struct dentry *);
-
+extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index 5d9193332bef..723a1c5e361b 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -24,7 +24,7 @@ static struct file_system_type autofs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "autofs",
 	.get_sb		= autofs_get_sb,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= autofs4_kill_sb,
 };
 
 static int __init init_autofs4_fs(void)
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 11a6a9ae51b7..51fd8595bf85 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -96,7 +96,7 @@ void autofs4_free_ino(struct autofs_info *ino)
  */
 static void autofs4_force_release(struct autofs_sb_info *sbi)
 {
-	struct dentry *this_parent = sbi->root;
+	struct dentry *this_parent = sbi->sb->s_root;
 	struct list_head *next;
 
 	spin_lock(&dcache_lock);
@@ -127,7 +127,7 @@ resume:
 		spin_lock(&dcache_lock);
 	}
 
-	if (this_parent != sbi->root) {
+	if (this_parent != sbi->sb->s_root) {
 		struct dentry *dentry = this_parent;
 
 		next = this_parent->d_u.d_child.next;
@@ -140,15 +140,9 @@ resume:
 		goto resume;
 	}
 	spin_unlock(&dcache_lock);
-
-	dput(sbi->root);
-	sbi->root = NULL;
-	shrink_dcache_sb(sbi->sb);
-
-	return;
 }
 
-static void autofs4_put_super(struct super_block *sb)
+void autofs4_kill_sb(struct super_block *sb)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(sb);
 
@@ -163,6 +157,7 @@ static void autofs4_put_super(struct super_block *sb)
 	kfree(sbi);
 
 	DPRINTK("shutting down");
+	kill_anon_super(sb);
 }
 
 static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
@@ -189,7 +184,6 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 }
 
 static struct super_operations autofs4_sops = {
-	.put_super	= autofs4_put_super,
 	.statfs		= simple_statfs,
 	.show_options	= autofs4_show_options,
 };
@@ -315,7 +309,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 
 	s->s_fs_info = sbi;
 	sbi->magic = AUTOFS_SBI_MAGIC;
-	sbi->root = NULL;
 	sbi->pipefd = -1;
 	sbi->catatonic = 0;
 	sbi->exp_timeout = 0;
@@ -397,13 +390,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->pipefd = pipefd;
 
 	/*
-	 * Take a reference to the root dentry so we get a chance to
-	 * clean up the dentry tree on umount.
-	 * See autofs4_force_release.
-	 */
-	sbi->root = dget(root);
-
-	/*
 	 * Success! Install the root dentry now to indicate completion.
 	 */
 	s->s_root = root;
@@ -447,7 +433,6 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 		inode->i_uid = 0;
 		inode->i_gid = 0;
 	}
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 5100f984783f..c1493524da4d 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -137,7 +137,9 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 		nd.flags = LOOKUP_DIRECTORY;
 		ret = (dentry->d_op->d_revalidate)(dentry, &nd);
 
-		if (!ret) {
+		if (ret <= 0) {
+			if (ret < 0)
+				status = ret;
 			dcache_dir_close(inode, file);
 			goto out;
 		}
@@ -279,9 +281,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 
 		DPRINTK("mount done status=%d", status);
 
-		if (status && dentry->d_inode)
-			return status; /* Try to get the kernel to invalidate this dentry */
-
 		/* Turn this into a real negative dentry? */
 		if (status == -ENOENT) {
 			spin_lock(&dentry->d_lock);
@@ -357,7 +356,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	 * don't try to mount it again.
 	 */
 	spin_lock(&dcache_lock);
-	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
+	if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
 		spin_unlock(&dcache_lock);
 
 		status = try_to_fill_dentry(dentry, 0);
@@ -400,13 +399,23 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
 	int oz_mode = autofs4_oz_mode(sbi);
 	int flags = nd ? nd->flags : 0;
-	int status = 0;
+	int status = 1;
 
 	/* Pending dentry */
 	if (autofs4_ispending(dentry)) {
-		if (!oz_mode)
-			status = try_to_fill_dentry(dentry, flags);
-		return !status;
+		/* The daemon never causes a mount to trigger */
+		if (oz_mode)
+			return 1;
+
+		/*
+		 * A zero status is success otherwise we have a
+		 * negative error code.
+		 */
+		status = try_to_fill_dentry(dentry, flags);
+		if (status == 0)
+				return 1;
+
+		return status;
 	}
 
 	/* Negative dentry.. invalidate if "old" */
@@ -421,9 +430,19 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 		DPRINTK("dentry=%p %.*s, emptydir",
 			 dentry, dentry->d_name.len, dentry->d_name.name);
 		spin_unlock(&dcache_lock);
-		if (!oz_mode)
-			status = try_to_fill_dentry(dentry, flags);
-		return !status;
+		/* The daemon never causes a mount to trigger */
+		if (oz_mode)
+			return 1;
+
+		/*
+		 * A zero status is success otherwise we have a
+		 * negative error code.
+		 */
+		status = try_to_fill_dentry(dentry, flags);
+		if (status == 0)
+			return 1;
+
+		return status;
 	}
 	spin_unlock(&dcache_lock);
 
@@ -518,6 +537,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
 			    return ERR_PTR(-ERESTARTNOINTR);
 			}
 		}
+		spin_lock(&dentry->d_lock);
+		dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
+		spin_unlock(&dentry->d_lock);
 	}
 
 	/*
@@ -616,7 +638,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	dput(ino->dentry);
 
 	dentry->d_inode->i_size = 0;
-	dentry->d_inode->i_nlink = 0;
+	clear_nlink(dentry->d_inode);
 
 	dir->i_mtime = CURRENT_TIME;
 
@@ -651,10 +673,10 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	}
 	dput(ino->dentry);
 	dentry->d_inode->i_size = 0;
-	dentry->d_inode->i_nlink = 0;
+	clear_nlink(dentry->d_inode);
 
 	if (dir->i_nlink)
-		dir->i_nlink--;
+		drop_nlink(dir);
 
 	return 0;
 }
@@ -691,7 +713,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (p_ino && dentry->d_parent != dentry)
 		atomic_inc(&p_ino->count);
 	ino->inode = inode;
-	dir->i_nlink++;
+	inc_nlink(dir);
 	dir->i_mtime = CURRENT_TIME;
 
 	return 0;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index ce103e7b0bc3..c0a6c8d445c7 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -45,7 +45,6 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
 		fput(sbi->pipe);	/* Close the pipe */
 		sbi->pipe = NULL;
 	}
-	shrink_dcache_sb(sbi->sb);
 }
 
 static int autofs4_write(struct file *file, const void *addr, int bytes)
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 80599ae33966..34e6d7b220c3 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -40,8 +40,6 @@ static const struct file_operations bad_file_ops =
 	.aio_fsync	= EIO_ERROR,
 	.fasync		= EIO_ERROR,
 	.lock		= EIO_ERROR,
-	.readv		= EIO_ERROR,
-	.writev		= EIO_ERROR,
 	.sendfile	= EIO_ERROR,
 	.sendpage	= EIO_ERROR,
 	.get_unmapped_area = EIO_ERROR,
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index 057a2c3d73b7..d9a40abda6b7 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -94,7 +94,7 @@ void befs_debug(const struct super_block *sb, const char *fmt, ...);
 
 void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
 void befs_dump_inode(const struct super_block *sb, befs_inode *);
-void befs_dump_index_entry(const struct super_block *sb, befs_btree_super *);
+void befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super *);
 void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *);
 /****************************/
 
@@ -136,7 +136,7 @@ blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno)
 static inline unsigned int
 befs_iaddrs_per_block(struct super_block *sb)
 {
-	return BEFS_SB(sb)->block_size / sizeof (befs_inode_addr);
+	return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr);
 }
 
 static inline int
@@ -151,4 +151,6 @@ befs_brun_size(struct super_block *sb, befs_block_run run)
 	return BEFS_SB(sb)->block_size * run.len;
 }
 
+#include "endian.h"
+
 #endif				/* _LINUX_BEFS_H */
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index 9095518e918d..e2595c2c403a 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -1,5 +1,5 @@
 /*
- * include/linux/befs_fs_types.h
+ * fs/befs/befs_fs_types.h
  *
  * Copyright (C) 2001 Will Dyson (will@cs.earlham.edu)
  *
@@ -79,17 +79,27 @@ enum inode_flags {
  * On-Disk datastructures of BeFS
  */
 
+typedef u64 __bitwise fs64;
+typedef u32 __bitwise fs32;
+typedef u16 __bitwise fs16;
+
 typedef u64 befs_off_t;
-typedef u64 befs_time_t;
-typedef void befs_binode_etc;
+typedef fs64 befs_time_t;
 
 /* Block runs */
 typedef struct {
+	fs32 allocation_group;
+	fs16 start;
+	fs16 len;
+} PACKED befs_disk_block_run;
+
+typedef struct {
 	u32 allocation_group;
 	u16 start;
 	u16 len;
 } PACKED befs_block_run;
 
+typedef befs_disk_block_run befs_disk_inode_addr;
 typedef befs_block_run befs_inode_addr;
 
 /*
@@ -97,31 +107,31 @@ typedef befs_block_run befs_inode_addr;
  */
 typedef struct {
 	char name[B_OS_NAME_LENGTH];
-	u32 magic1;
-	u32 fs_byte_order;
+	fs32 magic1;
+	fs32 fs_byte_order;
 
-	u32 block_size;
-	u32 block_shift;
+	fs32 block_size;
+	fs32 block_shift;
 
-	befs_off_t num_blocks;
-	befs_off_t used_blocks;
+	fs64 num_blocks;
+	fs64 used_blocks;
 
-	u32 inode_size;
+	fs32 inode_size;
 
-	u32 magic2;
-	u32 blocks_per_ag;
-	u32 ag_shift;
-	u32 num_ags;
+	fs32 magic2;
+	fs32 blocks_per_ag;
+	fs32 ag_shift;
+	fs32 num_ags;
 
-	u32 flags;
+	fs32 flags;
 
-	befs_block_run log_blocks;
-	befs_off_t log_start;
-	befs_off_t log_end;
+	befs_disk_block_run log_blocks;
+	fs64 log_start;
+	fs64 log_end;
 
-	u32 magic3;
-	befs_inode_addr root_dir;
-	befs_inode_addr indices;
+	fs32 magic3;
+	befs_disk_inode_addr root_dir;
+	befs_disk_inode_addr indices;
 
 } PACKED befs_super_block;
 
@@ -130,6 +140,16 @@ typedef struct {
  * be longer than one block!
  */
 typedef struct {
+	befs_disk_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
+	fs64 max_direct_range;
+	befs_disk_block_run indirect;
+	fs64 max_indirect_range;
+	befs_disk_block_run double_indirect;
+	fs64 max_double_indirect_range;
+	fs64 size;
+} PACKED befs_disk_data_stream;
+
+typedef struct {
 	befs_block_run direct[BEFS_NUM_DIRECT_BLOCKS];
 	befs_off_t max_direct_range;
 	befs_block_run indirect;
@@ -141,35 +161,35 @@ typedef struct {
 
 /* Attribute */
 typedef struct {
-	u32 type;
-	u16 name_size;
-	u16 data_size;
+	fs32 type;
+	fs16 name_size;
+	fs16 data_size;
 	char name[1];
 } PACKED befs_small_data;
 
 /* Inode structure */
 typedef struct {
-	u32 magic1;
-	befs_inode_addr inode_num;
-	u32 uid;
-	u32 gid;
-	u32 mode;
-	u32 flags;
+	fs32 magic1;
+	befs_disk_inode_addr inode_num;
+	fs32 uid;
+	fs32 gid;
+	fs32 mode;
+	fs32 flags;
 	befs_time_t create_time;
 	befs_time_t last_modified_time;
-	befs_inode_addr parent;
-	befs_inode_addr attributes;
-	u32 type;
+	befs_disk_inode_addr parent;
+	befs_disk_inode_addr attributes;
+	fs32 type;
 
-	u32 inode_size;
-	u32 etc;		/* not use */
+	fs32 inode_size;
+	fs32 etc;		/* not use */
 
 	union {
-		befs_data_stream datastream;
+		befs_disk_data_stream datastream;
 		char symlink[BEFS_SYMLINK_LEN];
 	} data;
 
-	u32 pad[4];		/* not use */
+	fs32 pad[4];		/* not use */
 	befs_small_data small_data[1];
 } PACKED befs_inode;
 
@@ -190,6 +210,16 @@ enum btree_types {
 };
 
 typedef struct {
+	fs32 magic;
+	fs32 node_size;
+	fs32 max_depth;
+	fs32 data_type;
+	fs64 root_node_ptr;
+	fs64 free_node_ptr;
+	fs64 max_size;
+} PACKED befs_disk_btree_super;
+
+typedef struct {
 	u32 magic;
 	u32 node_size;
 	u32 max_depth;
@@ -203,11 +233,19 @@ typedef struct {
  * Header stucture of each btree node
  */
 typedef struct {
+	fs64 left;
+	fs64 right;
+	fs64 overflow;
+	fs16 all_key_count;
+	fs16 all_key_length;
+} PACKED befs_btree_nodehead;
+
+typedef struct {
 	befs_off_t left;
 	befs_off_t right;
 	befs_off_t overflow;
 	u16 all_key_count;
 	u16 all_key_length;
-} PACKED befs_btree_nodehead;
+} PACKED befs_host_btree_nodehead;
 
 #endif				/* _LINUX_BEFS_FS_TYPES */
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 76e219799409..81b042ee24e6 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -30,7 +30,6 @@
 #include "befs.h"
 #include "btree.h"
 #include "datastream.h"
-#include "endian.h"
 
 /*
  * The btree functions in this file are built on top of the
@@ -80,7 +79,7 @@
  * In memory structure of each btree node
  */
 typedef struct {
-	befs_btree_nodehead head;	/* head of node converted to cpu byteorder */
+	befs_host_btree_nodehead head;	/* head of node converted to cpu byteorder */
 	struct buffer_head *bh;
 	befs_btree_nodehead *od_node;	/* on disk node */
 } befs_btree_node;
@@ -102,9 +101,9 @@ static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
 
 static int befs_leafnode(befs_btree_node * node);
 
-static u16 *befs_bt_keylen_index(befs_btree_node * node);
+static fs16 *befs_bt_keylen_index(befs_btree_node * node);
 
-static befs_off_t *befs_bt_valarray(befs_btree_node * node);
+static fs64 *befs_bt_valarray(befs_btree_node * node);
 
 static char *befs_bt_keydata(befs_btree_node * node);
 
@@ -136,7 +135,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 		   befs_btree_super * sup)
 {
 	struct buffer_head *bh = NULL;
-	befs_btree_super *od_sup = NULL;
+	befs_disk_btree_super *od_sup = NULL;
 
 	befs_debug(sb, "---> befs_btree_read_super()");
 
@@ -146,7 +145,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
 		befs_error(sb, "Couldn't read index header.");
 		goto error;
 	}
-	od_sup = (befs_btree_super *) bh->b_data;
+	od_sup = (befs_disk_btree_super *) bh->b_data;
 	befs_dump_index_entry(sb, od_sup);
 
 	sup->magic = fs32_to_cpu(sb, od_sup->magic);
@@ -342,7 +341,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
 	u16 keylen;
 	int findkey_len;
 	char *thiskey;
-	befs_off_t *valarray;
+	fs64 *valarray;
 
 	befs_debug(sb, "---> befs_find_key() %s", findkey);
 
@@ -422,7 +421,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
 	befs_btree_super bt_super;
 	befs_off_t node_off = 0;
 	int cur_key;
-	befs_off_t *valarray;
+	fs64 *valarray;
 	char *keystart;
 	u16 keylen;
 	int res;
@@ -572,7 +571,7 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
 				   this_node->head.overflow);
 			*node_off = this_node->head.overflow;
 		} else {
-			befs_off_t *valarray = befs_bt_valarray(this_node);
+			fs64 *valarray = befs_bt_valarray(this_node);
 			*node_off = fs64_to_cpu(sb, valarray[0]);
 		}
 		if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
@@ -622,7 +621,7 @@ befs_leafnode(befs_btree_node * node)
  *
  * Except that rounding up to 8 works, and rounding up to 4 doesn't.
  */
-static u16 *
+static fs16 *
 befs_bt_keylen_index(befs_btree_node * node)
 {
 	const int keylen_align = 8;
@@ -633,7 +632,7 @@ befs_bt_keylen_index(befs_btree_node * node)
 	if (tmp)
 		off += keylen_align - tmp;
 
-	return (u16 *) ((void *) node->od_node + off);
+	return (fs16 *) ((void *) node->od_node + off);
 }
 
 /**
@@ -643,13 +642,13 @@ befs_bt_keylen_index(befs_btree_node * node)
  * Returns a pointer to the start of the value array
  * of the node pointed to by the node header
  */
-static befs_off_t *
+static fs64 *
 befs_bt_valarray(befs_btree_node * node)
 {
 	void *keylen_index_start = (void *) befs_bt_keylen_index(node);
-	size_t keylen_index_size = node->head.all_key_count * sizeof (u16);
+	size_t keylen_index_size = node->head.all_key_count * sizeof (fs16);
 
-	return (befs_off_t *) (keylen_index_start + keylen_index_size);
+	return (fs64 *) (keylen_index_start + keylen_index_size);
 }
 
 /**
@@ -681,7 +680,7 @@ befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
 {
 	int prev_key_end;
 	char *keystart;
-	u16 *keylen_index;
+	fs16 *keylen_index;
 
 	if (index < 0 || index > node->head.all_key_count) {
 		*keylen = 0;
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index b7d6b920f65f..aacb4da6298a 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -18,7 +18,6 @@
 #include "befs.h"
 #include "datastream.h"
 #include "io.h"
-#include "endian.h"
 
 const befs_inode_addr BAD_IADDR = { 0, 0, 0 };
 
@@ -312,7 +311,7 @@ befs_find_brun_indirect(struct super_block *sb,
 	befs_blocknr_t indir_start_blk;
 	befs_blocknr_t search_blk;
 	struct buffer_head *indirblock;
-	befs_block_run *array;
+	befs_disk_block_run *array;
 
 	befs_block_run indirect = data->indirect;
 	befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
@@ -334,7 +333,7 @@ befs_find_brun_indirect(struct super_block *sb,
 			return BEFS_ERR;
 		}
 
-		array = (befs_block_run *) indirblock->b_data;
+		array = (befs_disk_block_run *) indirblock->b_data;
 
 		for (j = 0; j < arraylen; ++j) {
 			int len = fs16_to_cpu(sb, array[j].len);
@@ -427,7 +426,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	struct buffer_head *dbl_indir_block;
 	struct buffer_head *indir_block;
 	befs_block_run indir_run;
-	befs_inode_addr *iaddr_array = NULL;
+	befs_disk_inode_addr *iaddr_array = NULL;
 	befs_sb_info *befs_sb = BEFS_SB(sb);
 
 	befs_blocknr_t indir_start_blk =
@@ -482,7 +481,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 
 	dbl_block_indx =
 	    dblindir_indx - (dbl_which_block * befs_iaddrs_per_block(sb));
-	iaddr_array = (befs_inode_addr *) dbl_indir_block->b_data;
+	iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data;
 	indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]);
 	brelse(dbl_indir_block);
 	iaddr_array = NULL;
@@ -507,7 +506,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	}
 
 	block_indx = indir_indx - (which_block * befs_iaddrs_per_block(sb));
-	iaddr_array = (befs_inode_addr *) indir_block->b_data;
+	iaddr_array = (befs_disk_inode_addr *) indir_block->b_data;
 	*run = fsrun_to_cpu(sb, iaddr_array[block_indx]);
 	brelse(indir_block);
 	iaddr_array = NULL;
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 875cc0aa318c..e831a8f30849 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -21,7 +21,6 @@
 #endif				/* __KERNEL__ */
 
 #include "befs.h"
-#include "endian.h"
 
 #define ERRBUFSIZE 1024
 
@@ -125,7 +124,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
 	befs_debug(sb, "  type %08x", fs32_to_cpu(sb, inode->type));
 	befs_debug(sb, "  inode_size %u", fs32_to_cpu(sb, inode->inode_size));
 
-	if (S_ISLNK(inode->mode)) {
+	if (S_ISLNK(fs32_to_cpu(sb, inode->mode))) {
 		befs_debug(sb, "  Symbolic link [%s]", inode->data.symlink);
 	} else {
 		int i;
@@ -231,21 +230,20 @@ befs_dump_small_data(const struct super_block *sb, befs_small_data * sd)
 
 /* unused */
 void
-befs_dump_run(const struct super_block *sb, befs_block_run run)
+befs_dump_run(const struct super_block *sb, befs_disk_block_run run)
 {
 #ifdef CONFIG_BEFS_DEBUG
 
-	run = fsrun_to_cpu(sb, run);
+	befs_block_run n = fsrun_to_cpu(sb, run);
 
-	befs_debug(sb, "[%u, %hu, %hu]",
-		   run.allocation_group, run.start, run.len);
+	befs_debug(sb, "[%u, %hu, %hu]", n.allocation_group, n.start, n.len);
 
 #endif				//CONFIG_BEFS_DEBUG
 }
 #endif  /*  0  */
 
 void
-befs_dump_index_entry(const struct super_block *sb, befs_btree_super * super)
+befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super)
 {
 #ifdef CONFIG_BEFS_DEBUG
 
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index 9ecaea4e3325..e254a20869f4 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -10,85 +10,84 @@
 #define LINUX_BEFS_ENDIAN
 
 #include <linux/byteorder/generic.h>
-#include "befs.h"
 
 static inline u64
-fs64_to_cpu(const struct super_block *sb, u64 n)
+fs64_to_cpu(const struct super_block *sb, fs64 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return le64_to_cpu(n);
+		return le64_to_cpu((__force __le64)n);
 	else
-		return be64_to_cpu(n);
+		return be64_to_cpu((__force __be64)n);
 }
 
-static inline u64
+static inline fs64
 cpu_to_fs64(const struct super_block *sb, u64 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return cpu_to_le64(n);
+		return (__force fs64)cpu_to_le64(n);
 	else
-		return cpu_to_be64(n);
+		return (__force fs64)cpu_to_be64(n);
 }
 
 static inline u32
-fs32_to_cpu(const struct super_block *sb, u32 n)
+fs32_to_cpu(const struct super_block *sb, fs32 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return le32_to_cpu(n);
+		return le32_to_cpu((__force __le32)n);
 	else
-		return be32_to_cpu(n);
+		return be32_to_cpu((__force __be32)n);
 }
 
-static inline u32
+static inline fs32
 cpu_to_fs32(const struct super_block *sb, u32 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return cpu_to_le32(n);
+		return (__force fs32)cpu_to_le32(n);
 	else
-		return cpu_to_be32(n);
+		return (__force fs32)cpu_to_be32(n);
 }
 
 static inline u16
-fs16_to_cpu(const struct super_block *sb, u16 n)
+fs16_to_cpu(const struct super_block *sb, fs16 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return le16_to_cpu(n);
+		return le16_to_cpu((__force __le16)n);
 	else
-		return be16_to_cpu(n);
+		return be16_to_cpu((__force __be16)n);
 }
 
-static inline u16
+static inline fs16
 cpu_to_fs16(const struct super_block *sb, u16 n)
 {
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE)
-		return cpu_to_le16(n);
+		return (__force fs16)cpu_to_le16(n);
 	else
-		return cpu_to_be16(n);
+		return (__force fs16)cpu_to_be16(n);
 }
 
 /* Composite types below here */
 
 static inline befs_block_run
-fsrun_to_cpu(const struct super_block *sb, befs_block_run n)
+fsrun_to_cpu(const struct super_block *sb, befs_disk_block_run n)
 {
 	befs_block_run run;
 
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
-		run.allocation_group = le32_to_cpu(n.allocation_group);
-		run.start = le16_to_cpu(n.start);
-		run.len = le16_to_cpu(n.len);
+		run.allocation_group = le32_to_cpu((__force __le32)n.allocation_group);
+		run.start = le16_to_cpu((__force __le16)n.start);
+		run.len = le16_to_cpu((__force __le16)n.len);
 	} else {
-		run.allocation_group = be32_to_cpu(n.allocation_group);
-		run.start = be16_to_cpu(n.start);
-		run.len = be16_to_cpu(n.len);
+		run.allocation_group = be32_to_cpu((__force __be32)n.allocation_group);
+		run.start = be16_to_cpu((__force __be16)n.start);
+		run.len = be16_to_cpu((__force __be16)n.len);
 	}
 	return run;
 }
 
-static inline befs_block_run
+static inline befs_disk_block_run
 cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
 {
-	befs_block_run run;
+	befs_disk_block_run run;
 
 	if (BEFS_SB(sb)->byte_order == BEFS_BYTESEX_LE) {
 		run.allocation_group = cpu_to_le32(n.allocation_group);
@@ -103,7 +102,7 @@ cpu_to_fsrun(const struct super_block *sb, befs_block_run n)
 }
 
 static inline befs_data_stream
-fsds_to_cpu(const struct super_block *sb, befs_data_stream n)
+fsds_to_cpu(const struct super_block *sb, befs_disk_data_stream n)
 {
 	befs_data_stream data;
 	int i;
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index d41c9247ae8a..94c17f9a9576 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -8,7 +8,6 @@
 
 #include "befs.h"
 #include "inode.h"
-#include "endian.h"
 
 /*
 	Validates the correctness of the befs inode
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 50cfca5c7efd..07f7144f0e2e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -22,7 +22,6 @@
 #include "datastream.h"
 #include "super.h"
 #include "io.h"
-#include "endian.h"
 
 MODULE_DESCRIPTION("BeOS File System (BeFS) driver");
 MODULE_AUTHOR("Will Dyson");
@@ -365,7 +364,6 @@ befs_read_inode(struct inode *inode)
 	inode->i_mtime.tv_nsec = 0;   /* lower 16 bits are not a time */	
 	inode->i_ctime = inode->i_mtime;
 	inode->i_atime = inode->i_mtime;
-	inode->i_blksize = befs_sb->block_size;
 
 	befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
 	befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent);
@@ -446,9 +444,7 @@ befs_init_inodecache(void)
 static void
 befs_destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(befs_inode_cachep))
-		printk(KERN_ERR "befs_destroy_inodecache: "
-		       "not all structures were freed\n");
+	kmem_cache_destroy(befs_inode_cachep);
 }
 
 /*
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 4557acbac528..8c3401ff6d6a 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -11,7 +11,6 @@
 
 #include "befs.h"
 #include "super.h"
-#include "endian.h"
 
 /**
  * load_befs_sb -- Read from disk and properly byteswap all the fields
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 26fad9621738..a650f1d0b85e 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -102,7 +102,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 	inode->i_uid = current->fsuid;
 	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	inode->i_op = &bfs_file_inops;
 	inode->i_fop = &bfs_file_operations;
 	inode->i_mapping->a_ops = &bfs_aops;
@@ -117,8 +117,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode,
 
 	err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, inode->i_ino);
 	if (err) {
-		inode->i_nlink--;
-		mark_inode_dirty(inode);
+		inode_dec_link_count(inode);
 		iput(inode);
 		unlock_kernel();
 		return err;
@@ -164,7 +163,7 @@ static int bfs_link(struct dentry * old, struct inode * dir, struct dentry * new
 		unlock_kernel();
 		return err;
 	}
-	inode->i_nlink++;
+	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
 	atomic_inc(&inode->i_count);
@@ -196,9 +195,8 @@ static int bfs_unlink(struct inode * dir, struct dentry * dentry)
 	mark_buffer_dirty(bh);
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
-	inode->i_nlink--;
 	inode->i_ctime = dir->i_ctime;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 	error = 0;
 
 out_brelse:
@@ -249,9 +247,8 @@ static int bfs_rename(struct inode * old_dir, struct dentry * old_dentry,
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(old_dir);
 	if (new_inode) {
-		new_inode->i_nlink--;
 		new_inode->i_ctime = CURRENT_TIME_SEC;
-		mark_inode_dirty(new_inode);
+		inode_dec_link_count(new_inode);
 	}
 	mark_buffer_dirty(old_bh);
 	error = 0;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 3d5aca28a0a0..a9164a87f8de 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -19,8 +19,10 @@
 
 const struct file_operations bfs_file_operations = {
 	.llseek 	= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.sendfile	= generic_file_sendfile,
 };
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index cf74f3d4d966..ed27ffb3459e 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -76,7 +76,6 @@ static void bfs_read_inode(struct inode * inode)
 	inode->i_size = BFS_FILESIZE(di);
 	inode->i_blocks = BFS_FILEBLOCKS(di);
         if (inode->i_size || inode->i_blocks) dprintf("Registered inode with %lld size, %ld blocks\n", inode->i_size, inode->i_blocks);
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_atime.tv_sec =  le32_to_cpu(di->i_atime);
 	inode->i_mtime.tv_sec =  le32_to_cpu(di->i_mtime);
 	inode->i_ctime.tv_sec =  le32_to_cpu(di->i_ctime);
@@ -268,8 +267,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(bfs_inode_cachep))
-		printk(KERN_INFO "bfs_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(bfs_inode_cachep);
 }
 
 static struct super_operations bfs_sops = {
@@ -311,11 +309,10 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	unsigned i, imap_len;
 	struct bfs_sb_info * info;
 
-	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
 	s->s_fs_info = info;
-	memset(info, 0, sizeof(*info));
 
 	sb_set_blocksize(s, BFS_BSIZE);
 
@@ -338,10 +335,9 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 			+ BFS_ROOT_INO - 1;
 
 	imap_len = info->si_lasti/8 + 1;
-	info->si_imap = kmalloc(imap_len, GFP_KERNEL);
+	info->si_imap = kzalloc(imap_len, GFP_KERNEL);
 	if (!info->si_imap)
 		goto out;
-	memset(info->si_imap, 0, imap_len);
 	for (i=0; i<BFS_ROOT_INO; i++) 
 		set_bit(i, info->si_imap);
 
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f312103434d4..517e111bb7ef 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -278,6 +278,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		return -ENOEXEC;
 	}
 
+	/*
+	 * Requires a mmap handler. This prevents people from using a.out
+	 * as part of an exploit attack against /proc-related vulnerabilities.
+	 */
+	if (!bprm->file->f_op || !bprm->file->f_op->mmap)
+		return -ENOEXEC;
+
 	fd_offset = N_TXTOFF(ex);
 
 	/* Check initial limits. This avoids letting people circumvent
@@ -476,6 +483,13 @@ static int load_aout_library(struct file *file)
 		goto out;
 	}
 
+	/*
+	 * Requires a mmap handler. This prevents people from using a.out
+	 * as part of an exploit attack against /proc-related vulnerabilities.
+	 */
+	if (!file->f_op || !file->f_op->mmap)
+		goto out;
+
 	if (N_FLAGS(ex))
 		goto out;
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index dfd8cfb7fb5d..79b05a1a4365 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,7 +46,6 @@
 static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
 static int load_elf_library(struct file *);
 static unsigned long elf_map (struct file *, unsigned long, struct elf_phdr *, int, int);
-extern int dump_fpu (struct pt_regs *, elf_fpregset_t *);
 
 #ifndef elf_addr_t
 #define elf_addr_t unsigned long
@@ -1038,10 +1037,8 @@ out_free_interp:
 out_free_file:
 	sys_close(elf_exec_fileno);
 out_free_fh:
-	if (files) {
-		put_files_struct(current->files);
-		current->files = files;
-	}
+	if (files)
+		reset_files_struct(current, files);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
@@ -1154,11 +1151,23 @@ static int dump_write(struct file *file, const void *addr, int nr)
 
 static int dump_seek(struct file *file, loff_t off)
 {
-	if (file->f_op->llseek) {
-		if (file->f_op->llseek(file, off, 0) != off)
+	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
 			return 0;
-	} else
-		file->f_pos = off;
+	} else {
+		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
+		if (!buf)
+			return 0;
+		while (off > 0) {
+			unsigned long n = off;
+			if (n > PAGE_SIZE)
+				n = PAGE_SIZE;
+			if (!dump_write(file, buf, n))
+				return 0;
+			off -= n;
+		}
+		free_page((unsigned long)buf);
+	}
 	return 1;
 }
 
@@ -1206,30 +1215,35 @@ static int notesize(struct memelfnote *en)
 	return sz;
 }
 
-#define DUMP_WRITE(addr, nr)	\
-	do { if (!dump_write(file, (addr), (nr))) return 0; } while(0)
-#define DUMP_SEEK(off)	\
-	do { if (!dump_seek(file, (off))) return 0; } while(0)
+#define DUMP_WRITE(addr, nr, foffset)	\
+	do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
 
-static int writenote(struct memelfnote *men, struct file *file)
+static int alignfile(struct file *file, loff_t *foffset)
 {
-	struct elf_note en;
+	static const char buf[4] = { 0, };
+	DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
+	return 1;
+}
 
+static int writenote(struct memelfnote *men, struct file *file,
+			loff_t *foffset)
+{
+	struct elf_note en;
 	en.n_namesz = strlen(men->name) + 1;
 	en.n_descsz = men->datasz;
 	en.n_type = men->type;
 
-	DUMP_WRITE(&en, sizeof(en));
-	DUMP_WRITE(men->name, en.n_namesz);
-	/* XXX - cast from long long to long to avoid need for libgcc.a */
-	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */
-	DUMP_WRITE(men->data, men->datasz);
-	DUMP_SEEK(roundup((unsigned long)file->f_pos, 4));	/* XXX */
+	DUMP_WRITE(&en, sizeof(en), foffset);
+	DUMP_WRITE(men->name, en.n_namesz, foffset);
+	if (!alignfile(file, foffset))
+		return 0;
+	DUMP_WRITE(men->data, men->datasz, foffset);
+	if (!alignfile(file, foffset))
+		return 0;
 
 	return 1;
 }
 #undef DUMP_WRITE
-#undef DUMP_SEEK
 
 #define DUMP_WRITE(addr, nr)	\
 	if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
@@ -1429,7 +1443,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 	int i;
 	struct vm_area_struct *vma;
 	struct elfhdr *elf = NULL;
-	loff_t offset = 0, dataoff;
+	loff_t offset = 0, dataoff, foffset;
 	unsigned long limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
 	int numnote;
 	struct memelfnote *notes = NULL;
@@ -1481,20 +1495,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 
 	if (signr) {
 		struct elf_thread_status *tmp;
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 		do_each_thread(g,p)
 			if (current->mm == p->mm && current != p) {
 				tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
 				if (!tmp) {
-					read_unlock(&tasklist_lock);
+					rcu_read_unlock();
 					goto cleanup;
 				}
-				INIT_LIST_HEAD(&tmp->list);
 				tmp->thread = p;
 				list_add(&tmp->list, &thread_list);
 			}
 		while_each_thread(g,p);
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		list_for_each(t, &thread_list) {
 			struct elf_thread_status *tmp;
 			int sz;
@@ -1556,7 +1569,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 
 	DUMP_WRITE(elf, sizeof(*elf));
 	offset += sizeof(*elf);				/* Elf header */
-	offset += (segs+1) * sizeof(struct elf_phdr);	/* Program headers */
+	offset += (segs + 1) * sizeof(struct elf_phdr); /* Program headers */
+	foffset = offset;
 
 	/* Write notes phdr entry */
 	{
@@ -1573,7 +1587,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 		DUMP_WRITE(&phdr, sizeof(phdr));
 	}
 
-	/* Page-align dumped data */
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
 	/* Write program headers for segments dump */
@@ -1606,7 +1619,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 
  	/* write out the notes section */
 	for (i = 0; i < numnote; i++)
-		if (!writenote(notes + i, file))
+		if (!writenote(notes + i, file, &foffset))
 			goto end_coredump;
 
 	/* write out the thread status notes section */
@@ -1615,11 +1628,12 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 				list_entry(t, struct elf_thread_status, list);
 
 		for (i = 0; i < tmp->num_notes; i++)
-			if (!writenote(&tmp->notes[i], file))
+			if (!writenote(&tmp->notes[i], file, &foffset))
 				goto end_coredump;
 	}
- 
-	DUMP_SEEK(dataoff);
+
+	/* Align to page */
+	DUMP_SEEK(dataoff - foffset);
 
 	for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) {
 		unsigned long addr;
@@ -1635,10 +1649,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 
 			if (get_user_pages(current, current->mm, addr, 1, 0, 1,
 						&page, &vma) <= 0) {
-				DUMP_SEEK(file->f_pos + PAGE_SIZE);
+				DUMP_SEEK(PAGE_SIZE);
 			} else {
 				if (page == ZERO_PAGE(addr)) {
-					DUMP_SEEK(file->f_pos + PAGE_SIZE);
+					DUMP_SEEK(PAGE_SIZE);
 				} else {
 					void *kaddr;
 					flush_cache_page(vma, addr,
@@ -1662,13 +1676,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 	ELF_CORE_WRITE_EXTRA_DATA;
 #endif
 
-	if (file->f_pos != offset) {
-		/* Sanity check */
-		printk(KERN_WARNING
-		       "elf_core_dump: file->f_pos (%Ld) != offset (%Ld)\n",
-		       file->f_pos, offset);
-	}
-
 end_coredump:
 	set_fs(fs);
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2f3365829229..f86d5c9ce5eb 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1597,20 +1597,19 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 
 	if (signr) {
 		struct elf_thread_status *tmp;
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 		do_each_thread(g,p)
 			if (current->mm == p->mm && current != p) {
 				tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
 				if (!tmp) {
-					read_unlock(&tasklist_lock);
+					rcu_read_unlock();
 					goto cleanup;
 				}
-				INIT_LIST_HEAD(&tmp->list);
 				tmp->thread = p;
 				list_add(&tmp->list, &thread_list);
 			}
 		while_each_thread(g,p);
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 		list_for_each(t, &thread_list) {
 			struct elf_thread_status *tmp;
 			int sz;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 34ebbc191e46..1713c48fef54 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -215,10 +215,8 @@ _error:
 	bprm->interp_flags = 0;
 	bprm->interp_data = 0;
 _unshare:
-	if (files) {
-		put_files_struct(current->files);
-		current->files = files;
-	}
+	if (files)
+		reset_files_struct(current, files);
 	goto _ret;
 }
 
@@ -507,7 +505,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 		inode->i_mode = mode;
 		inode->i_uid = 0;
 		inode->i_gid = 0;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime =
 			current_fs_time(inode->i_sb);
@@ -517,7 +514,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 
 static void bm_clear_inode(struct inode *inode)
 {
-	kfree(inode->u.generic_ip);
+	kfree(inode->i_private);
 }
 
 static void kill_node(Node *e)
@@ -545,7 +542,7 @@ static void kill_node(Node *e)
 static ssize_t
 bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
 {
-	Node *e = file->f_dentry->d_inode->u.generic_ip;
+	Node *e = file->f_dentry->d_inode->i_private;
 	loff_t pos = *ppos;
 	ssize_t res;
 	char *page;
@@ -579,7 +576,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 				size_t count, loff_t *ppos)
 {
 	struct dentry *root;
-	Node *e = file->f_dentry->d_inode->u.generic_ip;
+	Node *e = file->f_dentry->d_inode->i_private;
 	int res = parse_command(buffer, count);
 
 	switch (res) {
@@ -646,7 +643,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	}
 
 	e->dentry = dget(dentry);
-	inode->u.generic_ip = e;
+	inode->i_private = e;
 	inode->i_fop = &bm_entry_operations;
 
 	d_instantiate(dentry, inode);
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 32b5d625ce9c..5bcdaaf4eae0 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -29,6 +29,7 @@
 #include <linux/personality.h>
 #include <linux/init.h>
 
+#include <asm/a.out.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
@@ -194,6 +195,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	unsigned long som_entry;
 	struct som_hdr *som_ex;
 	struct som_exec_auxhdr *hpuxhdr;
+	struct files_struct *files;
 
 	/* Get the exec-header */
 	som_ex = (struct som_hdr *) bprm->buf;
@@ -208,15 +210,27 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	size = som_ex->aux_header_size;
 	if (size > SOM_PAGESIZE)
 		goto out;
-	hpuxhdr = (struct som_exec_auxhdr *) kmalloc(size, GFP_KERNEL);
+	hpuxhdr = kmalloc(size, GFP_KERNEL);
 	if (!hpuxhdr)
 		goto out;
 
 	retval = kernel_read(bprm->file, som_ex->aux_header_location,
 			(char *) hpuxhdr, size);
+	if (retval != size) {
+		if (retval >= 0)
+			retval = -EIO;
+		goto out_free;
+	}
+
+	files = current->files; /* Refcounted so ok */
+	retval = unshare_files();
 	if (retval < 0)
 		goto out_free;
-#error "Fix security hole before enabling me"
+	if (files == current->files) {
+		put_files_struct(files);
+		files = NULL;
+	}
+
 	retval = get_unused_fd();
 	if (retval < 0)
 		goto out_free;
diff --git a/fs/bio.c b/fs/bio.c
index 6a0b9ad8f8c9..f95c8749499f 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -79,7 +79,6 @@ static struct bio_set *fs_bio_set;
 static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
 {
 	struct bio_vec *bvl;
-	struct biovec_slab *bp;
 
 	/*
 	 * see comment near bvec_array define!
@@ -98,10 +97,12 @@ static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned lon
 	 * idx now points to the pool we want to allocate from
 	 */
 
-	bp = bvec_slabs + *idx;
 	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
-	if (bvl)
+	if (bvl) {
+		struct biovec_slab *bp = bvec_slabs + *idx;
+
 		memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
+	}
 
 	return bvl;
 }
@@ -166,7 +167,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 
 		bio_init(bio);
 		if (likely(nr_iovecs)) {
-			unsigned long idx;
+			unsigned long idx = 0; /* shut up gcc */
 
 			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
 			if (unlikely(!bvl)) {
@@ -1142,7 +1143,7 @@ static int biovec_create_pools(struct bio_set *bs, int pool_entries, int scale)
 		struct biovec_slab *bp = bvec_slabs + i;
 		mempool_t **bvp = bs->bvec_pools + i;
 
-		if (i >= scale)
+		if (pool_entries > 1 && i >= scale)
 			pool_entries >>= 1;
 
 		*bvp = mempool_create_slab_pool(pool_entries, bp->slab);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 045f98854f14..bc8f27cc4483 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -17,11 +17,13 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/mount.h>
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <asm/uaccess.h>
+#include "internal.h"
 
 struct bdev_inode {
 	struct block_device bdev;
@@ -543,11 +545,11 @@ static struct kobject *bdev_get_holder(struct block_device *bdev)
 		return kobject_get(bdev->bd_disk->holder_dir);
 }
 
-static void add_symlink(struct kobject *from, struct kobject *to)
+static int add_symlink(struct kobject *from, struct kobject *to)
 {
 	if (!from || !to)
-		return;
-	sysfs_create_link(from, to, kobject_name(to));
+		return 0;
+	return sysfs_create_link(from, to, kobject_name(to));
 }
 
 static void del_symlink(struct kobject *from, struct kobject *to)
@@ -648,30 +650,38 @@ static void free_bd_holder(struct bd_holder *bo)
  * If there is no matching entry with @bo in @bdev->bd_holder_list,
  * add @bo to the list, create symlinks.
  *
- * Returns 1 if @bo was added to the list.
- * Returns 0 if @bo wasn't used by any reason and should be freed.
+ * Returns 0 if symlinks are created or already there.
+ * Returns -ve if something fails and @bo can be freed.
  */
 static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
 {
 	struct bd_holder *tmp;
+	int ret;
 
 	if (!bo)
-		return 0;
+		return -EINVAL;
 
 	list_for_each_entry(tmp, &bdev->bd_holder_list, list) {
 		if (tmp->sdir == bo->sdir) {
 			tmp->count++;
+			/* We've already done what we need to do here. */
+			free_bd_holder(bo);
 			return 0;
 		}
 	}
 
 	if (!bd_holder_grab_dirs(bdev, bo))
-		return 0;
+		return -EBUSY;
 
-	add_symlink(bo->sdir, bo->sdev);
-	add_symlink(bo->hdir, bo->hdev);
-	list_add_tail(&bo->list, &bdev->bd_holder_list);
-	return 1;
+	ret = add_symlink(bo->sdir, bo->sdev);
+	if (ret == 0) {
+		ret = add_symlink(bo->hdir, bo->hdev);
+		if (ret)
+			del_symlink(bo->sdir, bo->sdev);
+	}
+	if (ret == 0)
+		list_add_tail(&bo->list, &bdev->bd_holder_list);
+	return ret;
 }
 
 /**
@@ -741,7 +751,9 @@ static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
 
 	mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_PARTITION);
 	res = bd_claim(bdev, holder);
-	if (res || !add_bd_holder(bdev, bo))
+	if (res == 0)
+		res = add_bd_holder(bdev, bo);
+	if (res)
 		free_bd_holder(bo);
 	mutex_unlock(&bdev->bd_mutex);
 
@@ -1021,7 +1033,7 @@ do_open(struct block_device *bdev, struct file *file, unsigned int subclass)
 				rescan_partitions(bdev->bd_disk, bdev);
 		} else {
 			mutex_lock_nested(&bdev->bd_contains->bd_mutex,
-					  BD_MUTEX_PARTITION);
+					  BD_MUTEX_WHOLE);
 			bdev->bd_contains->bd_part_count++;
 			mutex_unlock(&bdev->bd_contains->bd_mutex);
 		}
@@ -1142,22 +1154,6 @@ static int blkdev_close(struct inode * inode, struct file * filp)
 	return blkdev_put(bdev);
 }
 
-static ssize_t blkdev_file_write(struct file *file, const char __user *buf,
-				   size_t count, loff_t *ppos)
-{
-	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
-
-	return generic_file_write_nolock(file, &local_iov, 1, ppos);
-}
-
-static ssize_t blkdev_file_aio_write(struct kiocb *iocb, const char __user *buf,
-				   size_t count, loff_t pos)
-{
-	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };
-
-	return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
-}
-
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
@@ -1177,18 +1173,16 @@ const struct file_operations def_blk_fops = {
 	.open		= blkdev_open,
 	.release	= blkdev_close,
 	.llseek		= block_llseek,
-	.read		= generic_file_read,
-	.write		= blkdev_file_write,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
   	.aio_read	= generic_file_aio_read,
-  	.aio_write	= blkdev_file_aio_write, 
+  	.aio_write	= generic_file_aio_write_nolock,
 	.mmap		= generic_file_mmap,
 	.fsync		= block_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.readv		= generic_file_readv,
-	.writev		= generic_file_write_nolock,
 	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
@@ -1303,3 +1297,24 @@ void close_bdev_excl(struct block_device *bdev)
 }
 
 EXPORT_SYMBOL(close_bdev_excl);
+
+int __invalidate_device(struct block_device *bdev)
+{
+	struct super_block *sb = get_super(bdev);
+	int res = 0;
+
+	if (sb) {
+		/*
+		 * no need to lock the super, get_super holds the
+		 * read mutex so the filesystem cannot go away
+		 * under us (->put_super runs with the write lock
+		 * hold).
+		 */
+		shrink_dcache_sb(sb);
+		res = invalidate_inodes(sb);
+		drop_super(sb);
+	}
+	invalidate_bdev(bdev, 0);
+	return res;
+}
+EXPORT_SYMBOL(__invalidate_device);
diff --git a/fs/buffer.c b/fs/buffer.c
index 3b6d701073e7..35527dca1dbc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -159,31 +159,6 @@ int sync_blockdev(struct block_device *bdev)
 }
 EXPORT_SYMBOL(sync_blockdev);
 
-static void __fsync_super(struct super_block *sb)
-{
-	sync_inodes_sb(sb, 0);
-	DQUOT_SYNC(sb);
-	lock_super(sb);
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-	unlock_super(sb);
-	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, 1);
-	sync_blockdev(sb->s_bdev);
-	sync_inodes_sb(sb, 1);
-}
-
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_super(struct super_block *sb)
-{
-	__fsync_super(sb);
-	return sync_blockdev(sb->s_bdev);
-}
-
 /*
  * Write out and wait upon all dirty data associated with this
  * device.   Filesystem data as well as the underlying block
@@ -260,118 +235,6 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb)
 EXPORT_SYMBOL(thaw_bdev);
 
 /*
- * sync everything.  Start out by waking pdflush, because that writes back
- * all queues in parallel.
- */
-static void do_sync(unsigned long wait)
-{
-	wakeup_pdflush(0);
-	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	DQUOT_SYNC(NULL);
-	sync_supers();		/* Write the superblocks */
-	sync_filesystems(0);	/* Start syncing the filesystems */
-	sync_filesystems(wait);	/* Waitingly sync the filesystems */
-	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
-	if (!wait)
-		printk("Emergency Sync complete\n");
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
-}
-
-asmlinkage long sys_sync(void)
-{
-	do_sync(1);
-	return 0;
-}
-
-void emergency_sync(void)
-{
-	pdflush_operation(do_sync, 0);
-}
-
-/*
- * Generic function to fsync a file.
- *
- * filp may be NULL if called via the msync of a vma.
- */
- 
-int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
-{
-	struct inode * inode = dentry->d_inode;
-	struct super_block * sb;
-	int ret, err;
-
-	/* sync the inode to buffers */
-	ret = write_inode_now(inode, 0);
-
-	/* sync the superblock to buffers */
-	sb = inode->i_sb;
-	lock_super(sb);
-	if (sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-	unlock_super(sb);
-
-	/* .. finally sync the buffers to disk */
-	err = sync_blockdev(sb->s_bdev);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-long do_fsync(struct file *file, int datasync)
-{
-	int ret;
-	int err;
-	struct address_space *mapping = file->f_mapping;
-
-	if (!file->f_op || !file->f_op->fsync) {
-		/* Why?  We can still call filemap_fdatawrite */
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = filemap_fdatawrite(mapping);
-
-	/*
-	 * We need to protect against concurrent writers, which could cause
-	 * livelocks in fsync_buffers_list().
-	 */
-	mutex_lock(&mapping->host->i_mutex);
-	err = file->f_op->fsync(file, file->f_dentry, datasync);
-	if (!ret)
-		ret = err;
-	mutex_unlock(&mapping->host->i_mutex);
-	err = filemap_fdatawait(mapping);
-	if (!ret)
-		ret = err;
-out:
-	return ret;
-}
-
-static long __do_fsync(unsigned int fd, int datasync)
-{
-	struct file *file;
-	int ret = -EBADF;
-
-	file = fget(fd);
-	if (file) {
-		ret = do_fsync(file, datasync);
-		fput(file);
-	}
-	return ret;
-}
-
-asmlinkage long sys_fsync(unsigned int fd)
-{
-	return __do_fsync(fd, 0);
-}
-
-asmlinkage long sys_fdatasync(unsigned int fd)
-{
-	return __do_fsync(fd, 1);
-}
-
-/*
  * Various filesystems appear to want __find_get_block to be non-blocking.
  * But it's the page lock which protects the buffers.  To get around this,
  * we get exclusion from try_to_free_buffers with the blockdev mapping's
@@ -589,6 +452,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 			       bdevname(bh->b_bdev, b));
 		}
 		set_bit(AS_EIO, &page->mapping->flags);
+		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 		SetPageError(page);
 	}
@@ -708,6 +572,10 @@ EXPORT_SYMBOL(mark_buffer_async_write);
 static inline void __remove_assoc_queue(struct buffer_head *bh)
 {
 	list_del_init(&bh->b_assoc_buffers);
+	WARN_ON(!bh->b_assoc_map);
+	if (buffer_write_io_error(bh))
+		set_bit(AS_EIO, &bh->b_assoc_map->flags);
+	bh->b_assoc_map = NULL;
 }
 
 int inode_has_buffers(struct inode *inode)
@@ -806,6 +674,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 		spin_lock(&buffer_mapping->private_lock);
 		list_move_tail(&bh->b_assoc_buffers,
 				&mapping->private_list);
+		bh->b_assoc_map = mapping;
 		spin_unlock(&buffer_mapping->private_lock);
 	}
 }
@@ -838,7 +707,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  */
 int __set_page_dirty_buffers(struct page *page)
 {
-	struct address_space * const mapping = page->mapping;
+	struct address_space * const mapping = page_mapping(page);
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
 
 	spin_lock(&mapping->private_lock);
 	if (page_has_buffers(page)) {
@@ -899,7 +771,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 	spin_lock(lock);
 	while (!list_empty(list)) {
 		bh = BH_ENTRY(list->next);
-		list_del_init(&bh->b_assoc_buffers);
+		__remove_assoc_queue(bh);
 		if (buffer_dirty(bh) || buffer_locked(bh)) {
 			list_add(&bh->b_assoc_buffers, &tmp);
 			if (buffer_dirty(bh)) {
@@ -920,7 +792,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 
 	while (!list_empty(&tmp)) {
 		bh = BH_ENTRY(tmp.prev);
-		__remove_assoc_queue(bh);
+		list_del_init(&bh->b_assoc_buffers);
 		get_bh(bh);
 		spin_unlock(lock);
 		wait_on_buffer(bh);
@@ -1176,8 +1048,21 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
 	} while ((size << sizebits) < PAGE_SIZE);
 
 	index = block >> sizebits;
-	block = index << sizebits;
 
+	/*
+	 * Check for a block which wants to lie outside our maximum possible
+	 * pagecache index.  (this comparison is done using sector_t types).
+	 */
+	if (unlikely(index != block >> sizebits)) {
+		char b[BDEVNAME_SIZE];
+
+		printk(KERN_ERR "%s: requested out-of-range block %llu for "
+			"device %s\n",
+			__FUNCTION__, (unsigned long long)block,
+			bdevname(bdev, b));
+		return -EIO;
+	}
+	block = index << sizebits;
 	/* Create a page with the proper size buffers.. */
 	page = grow_dev_page(bdev, block, index, size);
 	if (!page)
@@ -1204,12 +1089,16 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 
 	for (;;) {
 		struct buffer_head * bh;
+		int ret;
 
 		bh = __find_get_block(bdev, block, size);
 		if (bh)
 			return bh;
 
-		if (!grow_buffers(bdev, block, size))
+		ret = grow_buffers(bdev, block, size);
+		if (ret < 0)
+			return NULL;
+		if (ret == 0)
 			free_more_memory();
 	}
 }
@@ -1284,6 +1173,7 @@ void __bforget(struct buffer_head *bh)
 
 		spin_lock(&buffer_mapping->private_lock);
 		list_del_init(&bh->b_assoc_buffers);
+		bh->b_assoc_map = NULL;
 		spin_unlock(&buffer_mapping->private_lock);
 	}
 	__brelse(bh);
@@ -1551,35 +1441,6 @@ static void discard_buffer(struct buffer_head * bh)
 }
 
 /**
- * try_to_release_page() - release old fs-specific metadata on a page
- *
- * @page: the page which the kernel is trying to free
- * @gfp_mask: memory allocation flags (and I/O mode)
- *
- * The address_space is to try to release any data against the page
- * (presumably at page->private).  If the release was successful, return `1'.
- * Otherwise return zero.
- *
- * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
- *
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
- */
-int try_to_release_page(struct page *page, gfp_t gfp_mask)
-{
-	struct address_space * const mapping = page->mapping;
-
-	BUG_ON(!PageLocked(page));
-	if (PageWriteback(page))
-		return 0;
-	
-	if (mapping && mapping->a_ops->releasepage)
-		return mapping->a_ops->releasepage(page, gfp_mask);
-	return try_to_free_buffers(page);
-}
-EXPORT_SYMBOL(try_to_release_page);
-
-/**
  * block_invalidatepage - invalidate part of all of a buffer-backed page
  *
  * @page: the page which is affected
@@ -1630,14 +1491,6 @@ out:
 }
 EXPORT_SYMBOL(block_invalidatepage);
 
-void do_invalidatepage(struct page *page, unsigned long offset)
-{
-	void (*invalidatepage)(struct page *, unsigned long);
-	invalidatepage = page->mapping->a_ops->invalidatepage ? :
-		block_invalidatepage;
-	(*invalidatepage)(page, offset);
-}
-
 /*
  * We attach and possibly dirty the buffers atomically wrt
  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
@@ -2008,6 +1861,7 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 			clear_buffer_new(bh);
 			kaddr = kmap_atomic(page, KM_USER0);
 			memset(kaddr+block_start, 0, bh->b_size);
+			flush_dcache_page(page);
 			kunmap_atomic(kaddr, KM_USER0);
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
@@ -2514,6 +2368,7 @@ failed:
 	 */
 	kaddr = kmap_atomic(page, KM_USER0);
 	memset(kaddr, 0, PAGE_CACHE_SIZE);
+	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_USER0);
 	SetPageUptodate(page);
 	set_page_dirty(page);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3483d3cf8087..a885f46ca001 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -19,10 +19,30 @@
 #include <linux/kobj_map.h>
 #include <linux/cdev.h>
 #include <linux/mutex.h>
+#include <linux/backing-dev.h>
 
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
+#include "internal.h"
+
+/*
+ * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
+ * devices
+ * - permits shared-mmap for read, write and/or exec
+ * - does not permit private mmap in NOMMU mode (can't do COW)
+ * - no readahead or I/O queue unplugging required
+ */
+struct backing_dev_info directly_mappable_cdev_bdi = {
+	.capabilities	= (
+#ifdef CONFIG_MMU
+		/* permit private copies of the data to be taken */
+		BDI_CAP_MAP_COPY |
+#endif
+		/* permit direct mmap, for read, write or exec */
+		BDI_CAP_MAP_DIRECT |
+		BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP),
+};
 
 static struct kobj_map *cdev_map;
 
@@ -109,13 +129,31 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 
 	for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next)
 		if ((*cp)->major > major ||
-		    ((*cp)->major == major && (*cp)->baseminor >= baseminor))
+		    ((*cp)->major == major &&
+		     (((*cp)->baseminor >= baseminor) ||
+		      ((*cp)->baseminor + (*cp)->minorct > baseminor))))
 			break;
-	if (*cp && (*cp)->major == major &&
-	    (*cp)->baseminor < baseminor + minorct) {
-		ret = -EBUSY;
-		goto out;
+
+	/* Check for overlapping minor ranges.  */
+	if (*cp && (*cp)->major == major) {
+		int old_min = (*cp)->baseminor;
+		int old_max = (*cp)->baseminor + (*cp)->minorct - 1;
+		int new_min = baseminor;
+		int new_max = baseminor + minorct - 1;
+
+		/* New driver overlaps from the left.  */
+		if (new_max >= old_min && new_max <= old_max) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		/* New driver overlaps from the right.  */
+		if (new_min <= old_max && new_min >= old_min) {
+			ret = -EBUSY;
+			goto out;
+		}
 	}
+
 	cd->next = *cp;
 	*cp = cd;
 	mutex_unlock(&chrdevs_lock);
@@ -146,6 +184,15 @@ __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct)
 	return cd;
 }
 
+/**
+ * register_chrdev_region() - register a range of device numbers
+ * @from: the first in the desired range of device numbers; must include
+ *        the major number.
+ * @count: the number of consecutive device numbers required
+ * @name: the name of the device or driver.
+ *
+ * Return value is zero on success, a negative error code on failure.
+ */
 int register_chrdev_region(dev_t from, unsigned count, const char *name)
 {
 	struct char_device_struct *cd;
@@ -171,6 +218,17 @@ fail:
 	return PTR_ERR(cd);
 }
 
+/**
+ * alloc_chrdev_region() - register a range of char device numbers
+ * @dev: output parameter for first assigned number
+ * @baseminor: first of the requested range of minor numbers
+ * @count: the number of minor numbers required
+ * @name: the name of the associated device or driver
+ *
+ * Allocates a range of char device numbers.  The major number will be
+ * chosen dynamically, and returned (along with the first minor number)
+ * in @dev.  Returns zero or a negative error code.
+ */
 int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
 			const char *name)
 {
@@ -240,6 +298,15 @@ out2:
 	return err;
 }
 
+/**
+ * unregister_chrdev_region() - return a range of device numbers
+ * @from: the first in the range of numbers to unregister
+ * @count: the number of device numbers to unregister
+ *
+ * This function will unregister a range of @count device numbers,
+ * starting with @from.  The caller should normally be the one who
+ * allocated those numbers in the first place...
+ */
 void unregister_chrdev_region(dev_t from, unsigned count)
 {
 	dev_t to = from + count;
@@ -377,6 +444,16 @@ static int exact_lock(dev_t dev, void *data)
 	return cdev_get(p) ? 0 : -1;
 }
 
+/**
+ * cdev_add() - add a char device to the system
+ * @p: the cdev structure for the device
+ * @dev: the first device number for which this device is responsible
+ * @count: the number of consecutive minor numbers corresponding to this
+ *         device
+ *
+ * cdev_add() adds the device represented by @p to the system, making it
+ * live immediately.  A negative error code is returned on failure.
+ */
 int cdev_add(struct cdev *p, dev_t dev, unsigned count)
 {
 	p->dev = dev;
@@ -389,6 +466,13 @@ static void cdev_unmap(dev_t dev, unsigned count)
 	kobj_unmap(cdev_map, dev, count);
 }
 
+/**
+ * cdev_del() - remove a cdev from the system
+ * @p: the cdev structure to be removed
+ *
+ * cdev_del() removes @p from the system, possibly freeing the structure
+ * itself.
+ */
 void cdev_del(struct cdev *p)
 {
 	cdev_unmap(p->dev, p->count);
@@ -417,6 +501,11 @@ static struct kobj_type ktype_cdev_dynamic = {
 	.release	= cdev_dynamic_release,
 };
 
+/**
+ * cdev_alloc() - allocate a cdev structure
+ *
+ * Allocates and returns a cdev structure, or NULL on failure.
+ */
 struct cdev *cdev_alloc(void)
 {
 	struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
@@ -428,6 +517,14 @@ struct cdev *cdev_alloc(void)
 	return p;
 }
 
+/**
+ * cdev_init() - initialize a cdev structure
+ * @cdev: the structure to initialize
+ * @fops: the file_operations for this device
+ *
+ * Initializes @cdev, remembering @fops, making it ready to add to the
+ * system with cdev_add().
+ */
 void cdev_init(struct cdev *cdev, const struct file_operations *fops)
 {
 	memset(cdev, 0, sizeof *cdev);
@@ -461,3 +558,4 @@ EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
 EXPORT_SYMBOL(register_chrdev);
 EXPORT_SYMBOL(unregister_chrdev);
+EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/README b/fs/cifs/README
index 5f0e1bd64fee..432e515431c4 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -269,7 +269,7 @@ A partial list of the supported mount options follows:
 		(gid) mount option is specified.  For the uid (gid) of newly
 		created files and directories, ie files created since 
 		the last mount of the server share, the expected uid 
-		(gid) is cached as as long as the inode remains in 
+		(gid) is cached as long as the inode remains in 
 		memory on the client.   Also note that permission
 		checks (authorization checks) on accesses to a file occur
 		at the server, but there are cases in which an administrator
@@ -375,7 +375,7 @@ A partial list of the supported mount options follows:
 		the local process on newly created files, directories, and
 		devices (create, mkdir, mknod).  If the CIFS Unix Extensions
 		are not negotiated, for newly created files and directories
-		instead of using the default uid and gid specified on the
+		instead of using the default uid and gid specified on
 		the mount, cache the new file's uid and gid locally which means
 		that the uid for the file can change when the inode is
 	        reloaded (or the user remounts the share).
@@ -440,7 +440,7 @@ A partial list of the supported mount options follows:
 		create device files and fifos in a format compatible with
 		Services for Unix (SFU).  In addition retrieve bits 10-12
 		of the mode via the SETFILEBITS extended attribute (as
-		SFU does).  In the future the bottom 9 bits of the mode
+		SFU does).  In the future the bottom 9 bits of the
 		mode also will be emulated using queries of the security
 		descriptor (ACL).
  sign           Must use packet signing (helps avoid unwanted data modification
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index d0776ac2b804..5eff35d6e564 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -31,8 +31,8 @@ struct cifs_sid {
 } __attribute__((packed));
 
 /* everyone */
-extern const struct cifs_sid sid_everyone;
+/* extern const struct cifs_sid sid_everyone;*/
 /* group users */
-extern const struct cifs_sid sid_user;
+/* extern const struct cifs_sid sid_user;*/
 
 #endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.h b/fs/cifs/cifsencrypt.h
index 03e359b32861..152fa2dcfc6c 100644
--- a/fs/cifs/cifsencrypt.h
+++ b/fs/cifs/cifsencrypt.h
@@ -27,8 +27,6 @@ extern void mdfour(unsigned char *out, unsigned char *in, int n);
 /* smbdes.c */
 extern void E_P16(unsigned char *p14, unsigned char *p16);
 extern void E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24);
-extern void D_P16(unsigned char *p14, unsigned char *in, unsigned char *out);
-extern void E_old_pw_hash(unsigned char *, unsigned char *, unsigned char *);
 
 
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index c3ef1c0d0e68..84976cdbe713 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -63,6 +63,7 @@ extern struct task_struct * oplockThread; /* remove sparse warning */
 struct task_struct * oplockThread = NULL;
 extern struct task_struct * dnotifyThread; /* remove sparse warning */
 struct task_struct * dnotifyThread = NULL;
+static struct super_operations cifs_super_ops; 
 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
 module_param(CIFSMaxBufSize, int, 0);
 MODULE_PARM_DESC(CIFSMaxBufSize,"Network buffer size (not including header). Default: 16384 Range: 8192 to 130048");
@@ -198,10 +199,12 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
     /* Only need to call the old QFSInfo if failed
     on newer one */
     if(rc)
-	rc = CIFSSMBQFSInfo(xid, pTcon, buf);
+	if(pTcon->ses->capabilities & CAP_NT_SMBS)
+		rc = CIFSSMBQFSInfo(xid, pTcon, buf); /* not supported by OS2 */
 
-	/* Old Windows servers do not support level 103, retry with level 
-	   one if old server failed the previous call */ 
+	/* Some old Windows servers also do not support level 103, retry with
+	   older level one if old server failed the previous call or we
+	   bypassed it because we detected that this was an older LANMAN sess */
 	if(rc)
 		rc = SMBOldQFSInfo(xid, pTcon, buf);
 	/*     
@@ -253,7 +256,6 @@ cifs_alloc_inode(struct super_block *sb)
 	file data or metadata */
 	cifs_inode->clientCanCacheRead = FALSE;
 	cifs_inode->clientCanCacheAll = FALSE;
-	cifs_inode->vfs_inode.i_blksize = CIFS_MAX_MSGSIZE;
 	cifs_inode->vfs_inode.i_blkbits = 14;  /* 2**14 = CIFS_MAX_MSGSIZE */
 	cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME;
 	INIT_LIST_HEAD(&cifs_inode->openFileList);
@@ -436,13 +438,21 @@ static void cifs_umount_begin(struct vfsmount * vfsmnt, int flags)
 	return;
 }
 
+#ifdef CONFIG_CIFS_STATS2
+static int cifs_show_stats(struct seq_file *s, struct vfsmount *mnt)
+{
+	/* BB FIXME */
+	return 0;
+}
+#endif
+
 static int cifs_remount(struct super_block *sb, int *flags, char *data)
 {
 	*flags |= MS_NODIRATIME;
 	return 0;
 }
 
-struct super_operations cifs_super_ops = {
+static struct super_operations cifs_super_ops = {
 	.read_inode = cifs_read_inode,
 	.put_super = cifs_put_super,
 	.statfs = cifs_statfs,
@@ -455,6 +465,9 @@ struct super_operations cifs_super_ops = {
 	.show_options = cifs_show_options,
 	.umount_begin   = cifs_umount_begin,
 	.remount_fs = cifs_remount,
+#ifdef CONFIG_CIFS_STATS2
+	.show_stats = cifs_show_stats,
+#endif
 };
 
 static int
@@ -481,25 +494,13 @@ cifs_get_sb(struct file_system_type *fs_type,
 	return simple_set_mnt(mnt, sb);
 }
 
-static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov,
-				unsigned long nr_segs, loff_t *ppos)
-{
-	struct inode *inode = file->f_dentry->d_inode;
-	ssize_t written;
-
-	written = generic_file_writev(file, iov, nr_segs, ppos);
-	if (!CIFS_I(inode)->clientCanCacheAll)
-		filemap_fdatawrite(inode->i_mapping);
-	return written;
-}
-
-static ssize_t cifs_file_aio_write(struct kiocb *iocb, const char __user *buf,
-				   size_t count, loff_t pos)
+static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+				   unsigned long nr_segs, loff_t pos)
 {
 	struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
 	ssize_t written;
 
-	written = generic_file_aio_write(iocb, buf, count, pos);
+	written = generic_file_aio_write(iocb, iov, nr_segs, pos);
 	if (!CIFS_I(inode)->clientCanCacheAll)
 		filemap_fdatawrite(inode->i_mapping);
 	return written;
@@ -508,7 +509,7 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const char __user *buf,
 static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
 {
 	/* origin == SEEK_END => we must revalidate the cached file length */
-	if (origin == 2) {
+	if (origin == SEEK_END) {
 		int retval = cifs_revalidate(file->f_dentry);
 		if (retval < 0)
 			return (loff_t)retval;
@@ -578,8 +579,6 @@ struct inode_operations cifs_symlink_inode_ops = {
 const struct file_operations cifs_file_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
-	.readv = generic_file_readv,
-	.writev = cifs_file_writev,
 	.aio_read = generic_file_aio_read,
 	.aio_write = cifs_file_aio_write,
 	.open = cifs_open,
@@ -621,8 +620,6 @@ const struct file_operations cifs_file_direct_ops = {
 const struct file_operations cifs_file_nobrl_ops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
-	.readv = generic_file_readv,
-	.writev = cifs_file_writev,
 	.aio_read = generic_file_aio_read,
 	.aio_write = cifs_file_aio_write,
 	.open = cifs_open,
@@ -699,8 +696,7 @@ cifs_init_inodecache(void)
 static void
 cifs_destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(cifs_inode_cachep))
-		printk(KERN_WARNING "cifs_inode_cache: error freeing\n");
+	kmem_cache_destroy(cifs_inode_cachep);
 }
 
 static int
@@ -778,13 +774,9 @@ static void
 cifs_destroy_request_bufs(void)
 {
 	mempool_destroy(cifs_req_poolp);
-	if (kmem_cache_destroy(cifs_req_cachep))
-		printk(KERN_WARNING
-		       "cifs_destroy_request_cache: error not all structures were freed\n");
+	kmem_cache_destroy(cifs_req_cachep);
 	mempool_destroy(cifs_sm_req_poolp);
-	if (kmem_cache_destroy(cifs_sm_req_cachep))
-		printk(KERN_WARNING
-		      "cifs_destroy_request_cache: cifs_small_rq free error\n");
+	kmem_cache_destroy(cifs_sm_req_cachep);
 }
 
 static int
@@ -819,13 +811,8 @@ static void
 cifs_destroy_mids(void)
 {
 	mempool_destroy(cifs_mid_poolp);
-	if (kmem_cache_destroy(cifs_mid_cachep))
-		printk(KERN_WARNING
-		       "cifs_destroy_mids: error not all structures were freed\n");
-
-	if (kmem_cache_destroy(cifs_oplock_cachep))
-		printk(KERN_WARNING
-		       "error not all oplock structures were freed\n");
+	kmem_cache_destroy(cifs_mid_cachep);
+	kmem_cache_destroy(cifs_oplock_cachep);
 }
 
 static int cifs_oplock_thread(void * dummyarg)
@@ -930,7 +917,7 @@ init_cifs(void)
 #ifdef CONFIG_PROC_FS
 	cifs_proc_init();
 #endif
-	INIT_LIST_HEAD(&GlobalServerList);	/* BB not implemented yet */
+/*	INIT_LIST_HEAD(&GlobalServerList);*/	/* BB not implemented yet */
 	INIT_LIST_HEAD(&GlobalSMBSessionList);
 	INIT_LIST_HEAD(&GlobalTreeConnectionList);
 	INIT_LIST_HEAD(&GlobalOplock_Q);
@@ -958,6 +945,7 @@ init_cifs(void)
 	GlobalCurrentXid = 0;
 	GlobalTotalActiveXid = 0;
 	GlobalMaxActiveXid = 0;
+	memset(Local_System_Name, 0, 15);
 	rwlock_init(&GlobalSMBSeslock);
 	spin_lock_init(&GlobalMid_Lock);
 
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index bea875d9a46a..a243f779b363 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -36,7 +36,7 @@ extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
 /* Functions related to super block operations */
-extern struct super_operations cifs_super_ops;
+/* extern struct super_operations cifs_super_ops;*/
 extern void cifs_read_inode(struct inode *);
 extern void cifs_delete_inode(struct inode *);
 /* extern void cifs_write_inode(struct inode *); *//* BB not needed yet */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b24006c47df1..74d3ccbb103b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -153,7 +153,7 @@ struct TCP_Server_Info {
 	char sessid[4];		/* unique token id for this session */
 	/* (returned on Negotiate */
 	int capabilities; /* allow selective disabling of caps by smb sess */
-	__u16 timeZone;
+	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u16 CurrentMid;         /* multiplex id - rotating counter */
 	char cryptKey[CIFS_CRYPTO_KEY_SIZE];
 	/* 16th byte of RFC1001 workstation name is always null */
@@ -203,9 +203,14 @@ struct cifsSesInfo {
 	char * domainName;
 	char * password;
 };
-/* session flags */
+/* no more than one of the following three session flags may be set */
 #define CIFS_SES_NT4 1
-
+#define CIFS_SES_OS2 2
+#define CIFS_SES_W9X 4
+/* following flag is set for old servers such as OS2 (and Win95?)
+   which do not negotiate NTLM or POSIX dialects, but instead
+   negotiate one of the older LANMAN dialects */
+#define CIFS_SES_LANMAN 8
 /*
  * there is one of these for each connection to a resource on a particular
  * session 
@@ -512,7 +517,8 @@ require use of the stronger protocol */
  * This list helps improve performance and eliminate the messages indicating
  * that we had a communications error talking to the server in this list. 
  */
-GLOBAL_EXTERN struct servers_not_supported *NotSuppList;	/*@z4a */
+/* Feature not supported */
+/* GLOBAL_EXTERN struct servers_not_supported *NotSuppList; */
 
 /*
  * The following is a hash table of all the users we know about.
@@ -568,7 +574,6 @@ GLOBAL_EXTERN unsigned int lookupCacheEnabled;
 GLOBAL_EXTERN unsigned int extended_security;	/* if on, session setup sent 
 				with more secure ntlmssp2 challenge/resp */
 GLOBAL_EXTERN unsigned int sign_CIFS_PDUs;  /* enable smb packet signing */
-GLOBAL_EXTERN unsigned int secFlags;
 GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/
 GLOBAL_EXTERN unsigned int CIFSMaxBufSize;  /* max size not including hdr */
 GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 81df2bf8e75a..6df9dadba647 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -26,7 +26,8 @@
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 #define LANMAN_PROT 0
-#define CIFS_PROT   1
+#define LANMAN2_PROT 1
+#define CIFS_PROT   2
 #else
 #define CIFS_PROT   0
 #endif
@@ -408,6 +409,8 @@ typedef struct negotiate_req {
 
 /* Dialect index is 13 for LANMAN */
 
+#define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */
+
 typedef struct lanman_neg_rsp {
 	struct smb_hdr hdr;	/* wct = 13 */
 	__le16 DialectIndex;
@@ -417,7 +420,10 @@ typedef struct lanman_neg_rsp {
 	__le16 MaxNumberVcs;
 	__le16 RawMode;
 	__le32 SessionKey;
-	__le32 ServerTime;
+	struct {
+		__le16 Time;
+		__le16 Date;
+	} __attribute__((packed)) SrvTime;
 	__le16 ServerTimeZone;
 	__le16 EncryptionKeyLength;
 	__le16 Reserved;
@@ -674,7 +680,7 @@ typedef union smb_com_tree_disconnect {	/* as an altetnative can use flag on
 typedef struct smb_com_close_req {
 	struct smb_hdr hdr;	/* wct = 3 */
 	__u16 FileID;
-	__u32 LastWriteTime;	/* should be zero */
+	__u32 LastWriteTime;	/* should be zero or -1 */
 	__u16 ByteCount;	/* 0 */
 } __attribute__((packed)) CLOSE_REQ;
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b35c55c3c8bb..f1f8225102f0 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -50,12 +50,12 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
 extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *,
 			struct kvec *, int /* nvec to send */, 
 			int * /* type of buf returned */ , const int long_op);
-extern int SendReceiveBlockingLock(const unsigned int /* xid */ , struct cifsTconInfo *,
+extern int SendReceiveBlockingLock(const unsigned int /* xid */ , 
+					struct cifsTconInfo *,
 				struct smb_hdr * /* input */ ,
 				struct smb_hdr * /* out */ ,
 				int * /* bytes returned */);
-extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid);
-extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length);
+extern int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length);
 extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *);
 extern int is_size_safe_to_change(struct cifsInodeInfo *);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *);
@@ -80,6 +80,9 @@ extern struct oplock_q_entry * AllocOplockQEntry(struct inode *, u16,
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern struct timespec cifs_NTtimeToUnix(u64 /* utc nanoseconds since 1601 */ );
 extern u64 cifs_UnixTimeToNT(struct timespec);
+extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path, 
 			FILE_ALL_INFO * pfile_info,
@@ -116,6 +119,7 @@ extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
 extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 			const unsigned char *searchName,
 			FILE_ALL_INFO * findData,
+			int legacy /* whether to use old info level */,
 			const struct nls_table *nls_codepage, int remap);
 extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
                         const unsigned char *searchName,
@@ -279,8 +283,6 @@ extern void sesInfoFree(struct cifsSesInfo *);
 extern struct cifsTconInfo *tconInfoAlloc(void);
 extern void tconInfoFree(struct cifsTconInfo *);
 
-extern int cifs_reconnect(struct TCP_Server_Info *server);
-
 extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *,__u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 			  __u32 *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 075d8fb3d376..098790eb2aa1 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -46,6 +46,7 @@ static struct {
 } protocols[] = {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	{LANMAN_PROT, "\2LM1.2X002"},
+	{LANMAN2_PROT, "\2LANMAN2.1"},
 #endif /* weak password hashing for legacy clients */
 	{CIFS_PROT, "\2NT LM 0.12"}, 
 	{POSIX_PROT, "\2POSIX 2"},
@@ -58,6 +59,7 @@ static struct {
 } protocols[] = {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 	{LANMAN_PROT, "\2LM1.2X002"},
+	{LANMAN2_PROT, "\2LANMAN2.1"},
 #endif /* weak password hashing for legacy clients */
 	{CIFS_PROT, "\2NT LM 0.12"}, 
 	{BAD_PROT, "\2"}
@@ -67,13 +69,13 @@ static struct {
 /* define the number of elements in the cifs dialect array */
 #ifdef CONFIG_CIFS_POSIX
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 3
+#define CIFS_NUM_PROT 4
 #else
 #define CIFS_NUM_PROT 2
 #endif /* CIFS_WEAK_PW_HASH */
 #else /* not posix */
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-#define CIFS_NUM_PROT 2
+#define CIFS_NUM_PROT 3
 #else
 #define CIFS_NUM_PROT 1
 #endif /* CONFIG_CIFS_WEAK_PW_HASH */
@@ -397,6 +399,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	struct TCP_Server_Info * server;
 	u16 count;
 	unsigned int secFlags;
+	u16 dialect;
 
 	if(ses->server)
 		server = ses->server;
@@ -436,9 +439,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	if (rc != 0) 
 		goto neg_err_exit;
 
-	cFYI(1,("Dialect: %d", pSMBr->DialectIndex));
+	dialect = le16_to_cpu(pSMBr->DialectIndex);
+	cFYI(1,("Dialect: %d", dialect));
 	/* Check wct = 1 error case */
-	if((pSMBr->hdr.WordCount < 13) || (pSMBr->DialectIndex == BAD_PROT)) {
+	if((pSMBr->hdr.WordCount < 13) || (dialect == BAD_PROT)) {
 		/* core returns wct = 1, but we do not ask for core - otherwise
 		small wct just comes when dialect index is -1 indicating we 
 		could not negotiate a common dialect */
@@ -446,7 +450,9 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		goto neg_err_exit;
 #ifdef CONFIG_CIFS_WEAK_PW_HASH 
 	} else if((pSMBr->hdr.WordCount == 13)
-			&& (pSMBr->DialectIndex == LANMAN_PROT)) {
+			&& ((dialect == LANMAN_PROT)
+				|| (dialect == LANMAN2_PROT))) {
+		__s16 tmp;
 		struct lanman_neg_rsp * rsp = (struct lanman_neg_rsp *)pSMBr;
 
 		if((secFlags & CIFSSEC_MAY_LANMAN) || 
@@ -472,12 +478,44 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			server->maxRw = 0;/* we do not need to use raw anyway */
 			server->capabilities = CAP_MPX_MODE;
 		}
-		server->timeZone = le16_to_cpu(rsp->ServerTimeZone);
+		tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
+		if (tmp == -1) {
+			/* OS/2 often does not set timezone therefore
+			 * we must use server time to calc time zone.
+			 * Could deviate slightly from the right zone.
+			 * Smallest defined timezone difference is 15 minutes
+			 * (i.e. Nepal).  Rounding up/down is done to match
+			 * this requirement.
+			 */
+			int val, seconds, remain, result;
+			struct timespec ts, utc;
+			utc = CURRENT_TIME;
+			ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
+						le16_to_cpu(rsp->SrvTime.Time));
+			cFYI(1,("SrvTime: %d sec since 1970 (utc: %d) diff: %d",
+				(int)ts.tv_sec, (int)utc.tv_sec, 
+				(int)(utc.tv_sec - ts.tv_sec)));
+			val = (int)(utc.tv_sec - ts.tv_sec);
+			seconds = val < 0 ? -val : val;
+			result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ;
+			remain = seconds % MIN_TZ_ADJ;
+			if(remain >= (MIN_TZ_ADJ / 2))
+				result += MIN_TZ_ADJ;
+			if(val < 0)
+				result = - result;
+			server->timeAdj = result;
+		} else {
+			server->timeAdj = (int)tmp;
+			server->timeAdj *= 60; /* also in seconds */
+		}
+		cFYI(1,("server->timeAdj: %d seconds", server->timeAdj));
+
 
 		/* BB get server time for time conversions and add
 		code to use it and timezone since this is not UTC */	
 
-		if (rsp->EncryptionKeyLength == cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
+		if (rsp->EncryptionKeyLength == 
+				cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) {
 			memcpy(server->cryptKey, rsp->EncryptionKey,
 				CIFS_CRYPTO_KEY_SIZE);
 		} else if (server->secMode & SECMODE_PW_ENCRYPT) {
@@ -531,7 +569,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	cFYI(0, ("Max buf = %d", ses->server->maxBuf));
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
-	server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone);	
+	server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
+	server->timeAdj *= 60;
 	if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) {
 		memcpy(server->cryptKey, pSMBr->u.EncryptionKey,
 		       CIFS_CRYPTO_KEY_SIZE);
@@ -1617,7 +1656,7 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 	pSMBr = (CLOSE_RSP *)pSMB; /* BB removeme BB */
 
 	pSMB->FileID = (__u16) smb_file_id;
-	pSMB->LastWriteTime = 0;
+	pSMB->LastWriteTime = 0xFFFFFFFF;
 	pSMB->ByteCount = 0;
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2773,9 +2812,11 @@ GetExtAttrOut:
 
 
 /* security id for everyone */
-const struct cifs_sid sid_everyone = {1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
+const static struct cifs_sid sid_everyone = 
+		{1, 1, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0}};
 /* group users */
-const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
+const static struct cifs_sid sid_user = 
+		{1, 2 , {0, 0, 0, 0, 0, 5}, {32, 545, 0, 0}};
 
 /* Convert CIFS ACL to POSIX form */
 static int parse_sec_desc(struct cifs_sid * psec_desc, int acl_len)
@@ -2856,7 +2897,6 @@ qsec_out:
 	return rc;
 }
 
-
 /* Legacy Query Path Information call for lookup to old servers such
    as Win9x/WinME */
 int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
@@ -2898,7 +2938,16 @@ QInfRetry:
 	if (rc) {
 		cFYI(1, ("Send error in QueryInfo = %d", rc));
 	} else if (pFinfo) {            /* decode response */
+		struct timespec ts;
+		__u32 time = le32_to_cpu(pSMBr->last_write_time);
+		/* BB FIXME - add time zone adjustment BB */
 		memset(pFinfo, 0, sizeof(FILE_ALL_INFO));
+		ts.tv_nsec = 0;
+		ts.tv_sec = time;
+		/* decode time fields */
+		pFinfo->ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(ts));
+		pFinfo->LastWriteTime = pFinfo->ChangeTime;
+		pFinfo->LastAccessTime = 0;
 		pFinfo->AllocationSize =
 			cpu_to_le64(le32_to_cpu(pSMBr->size));
 		pFinfo->EndOfFile = pFinfo->AllocationSize;
@@ -2922,6 +2971,7 @@ int
 CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
 		 const unsigned char *searchName,
 		 FILE_ALL_INFO * pFindData,
+		 int legacy /* old style infolevel */,
 		 const struct nls_table *nls_codepage, int remap)
 {
 /* level 263 SMB_QUERY_FILE_ALL_INFO */
@@ -2970,7 +3020,10 @@ QPathInfoRetry:
 	byte_count = params + 1 /* pad */ ;
 	pSMB->TotalParameterCount = cpu_to_le16(params);
 	pSMB->ParameterCount = pSMB->TotalParameterCount;
-	pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
+	if(legacy)
+		pSMB->InformationLevel = cpu_to_le16(SMB_INFO_STANDARD);
+	else
+		pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
 	pSMB->Reserved4 = 0;
 	pSMB->hdr.smb_buf_length += byte_count;
 	pSMB->ByteCount = cpu_to_le16(byte_count);
@@ -2982,13 +3035,24 @@ QPathInfoRetry:
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
-		if (rc || (pSMBr->ByteCount < 40)) 
+		if (rc) /* BB add auto retry on EOPNOTSUPP? */
+			rc = -EIO;
+		else if (!legacy && (pSMBr->ByteCount < 40)) 
 			rc = -EIO;	/* bad smb */
+		else if(legacy && (pSMBr->ByteCount < 24))
+			rc = -EIO;  /* 24 or 26 expected but we do not read last field */
 		else if (pFindData){
+			int size;
 			__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
+			if(legacy) /* we do not read the last field, EAsize, fortunately
+					   since it varies by subdialect and on Set vs. Get, is  
+					   two bytes or 4 bytes depending but we don't care here */
+				size = sizeof(FILE_INFO_STANDARD);
+			else
+				size = sizeof(FILE_ALL_INFO);
 			memcpy((char *) pFindData,
 			       (char *) &pSMBr->hdr.Protocol +
-			       data_offset, sizeof (FILE_ALL_INFO));
+			       data_offset, size);
 		} else
 		    rc = -ENOMEM;
 	}
@@ -3613,6 +3677,14 @@ getDFSRetry:
 		strncpy(pSMB->RequestFileName, searchName, name_len);
 	}
 
+	if(ses->server) {
+		if(ses->server->secMode &
+		   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
+			pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+	}
+
+        pSMB->hdr.Uid = ses->Suid;
+
 	params = 2 /* level */  + name_len /*includes null */ ;
 	pSMB->TotalDataCount = 0;
 	pSMB->DataCount = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 0e9ba0b9d71e..4093d5332930 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -109,7 +109,7 @@ static int ipv6_connect(struct sockaddr_in6 *psin_server,
 	 * wake up waiters on reconnection? - (not needed currently)
 	 */
 
-int
+static int
 cifs_reconnect(struct TCP_Server_Info *server)
 {
 	int rc = 0;
@@ -771,13 +771,18 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
 	separator[0] = ',';
 	separator[1] = 0; 
 
-	memset(vol->source_rfc1001_name,0x20,15);
-	for(i=0;i < strnlen(system_utsname.nodename,15);i++) {
-		/* does not have to be a perfect mapping since the field is
-		informational, only used for servers that do not support
-		port 445 and it can be overridden at mount time */
-		vol->source_rfc1001_name[i] = 
-			toupper(system_utsname.nodename[i]);
+	if (Local_System_Name[0] != 0)
+		memcpy(vol->source_rfc1001_name, Local_System_Name,15);
+	else {
+		char *nodename = utsname()->nodename;
+		int n = strnlen(nodename,15);
+		memset(vol->source_rfc1001_name,0x20,15);
+		for(i=0 ; i < n ; i++) {
+			/* does not have to be perfect mapping since field is
+			informational, only used for servers that do not support
+			port 445 and it can be overridden at mount time */
+			vol->source_rfc1001_name[i] = toupper(nodename[i]);
+		}
 	}
 	vol->source_rfc1001_name[15] = 0;
 	/* null target name indicates to use *SMBSERVR default called name
@@ -2153,7 +2158,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 				  32, nls_codepage);
 		bcc_ptr += 2 * bytes_returned;
 		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release,
+		    cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release,
 				  32, nls_codepage);
 		bcc_ptr += 2 * bytes_returned;
 		bcc_ptr += 2;
@@ -2180,8 +2185,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 		}
 		strcpy(bcc_ptr, "Linux version ");
 		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, system_utsname.release);
-		bcc_ptr += strlen(system_utsname.release) + 1;
+		strcpy(bcc_ptr, utsname()->release);
+		bcc_ptr += strlen(utsname()->release) + 1;
 		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
 		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
 	}
@@ -2445,7 +2450,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 				  32, nls_codepage);
 		bcc_ptr += 2 * bytes_returned;
 		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
+		    cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
 				  nls_codepage);
 		bcc_ptr += 2 * bytes_returned;
 		bcc_ptr += 2;	/* null terminate Linux version */
@@ -2462,8 +2467,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
 	} else {		/* ASCII */
 		strcpy(bcc_ptr, "Linux version ");
 		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, system_utsname.release);
-		bcc_ptr += strlen(system_utsname.release) + 1;
+		strcpy(bcc_ptr, utsname()->release);
+		bcc_ptr += strlen(utsname()->release) + 1;
 		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
 		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
 		bcc_ptr++;	/* empty domain field */
@@ -2836,7 +2841,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 				  32, nls_codepage);
 		bcc_ptr += 2 * bytes_returned;
 		bytes_returned =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32,
+		    cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32,
 				  nls_codepage);
 		bcc_ptr += 2 * bytes_returned;
 		bcc_ptr += 2;	/* null term version string */
@@ -2888,8 +2893,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 
 		strcpy(bcc_ptr, "Linux version ");
 		bcc_ptr += strlen("Linux version ");
-		strcpy(bcc_ptr, system_utsname.release);
-		bcc_ptr += strlen(system_utsname.release) + 1;
+		strcpy(bcc_ptr, utsname()->release);
+		bcc_ptr += strlen(utsname()->release) + 1;
 		strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
 		bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
 		bcc_ptr++;	/* null domain */
@@ -3215,7 +3220,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 			}
 			/* else do not bother copying these informational fields */
 		}
-		if(smb_buffer_response->WordCount == 3)
+		if((smb_buffer_response->WordCount == 3) ||
+			 (smb_buffer_response->WordCount == 7))
+			/* field is in same location */
 			tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport);
 		else
 			tcon->Flags = 0;
@@ -3312,19 +3319,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 		first_time = 1;
 	}
 	if (!rc) {
+		pSesInfo->flags = 0;
 		pSesInfo->capabilities = pSesInfo->server->capabilities;
 		if(linuxExtEnabled == 0)
 			pSesInfo->capabilities &= (~CAP_UNIX);
 	/*	pSesInfo->sequence_number = 0;*/
-		cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x Time Zone: %d",
+		cFYI(1,("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
 			pSesInfo->server->secMode,
 			pSesInfo->server->capabilities,
-			pSesInfo->server->timeZone));
+			pSesInfo->server->timeAdj));
 		if(experimEnabled < 2)
 			rc = CIFS_SessSetup(xid, pSesInfo,
 					    first_time, nls_info);
 		else if (extended_security
-				&& (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
+				&& (pSesInfo->capabilities 
+					& CAP_EXTENDED_SECURITY)
 				&& (pSesInfo->server->secType == NTLMSSP)) {
 			rc = -EOPNOTSUPP;
 		} else if (extended_security
@@ -3338,7 +3347,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
 			if (!rc) {
 				if(ntlmv2_flag) {
 					char * v2_response;
-					cFYI(1,("Can use more secure NTLM version 2 password hash"));
+					cFYI(1,("more secure NTLM ver2 hash"));
 					if(CalcNTLMv2_partial_mac_key(pSesInfo, 
 						nls_info)) {
 						rc = -ENOMEM;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ddb012a68023..976a691c5a68 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -25,7 +25,6 @@
 #include <linux/backing-dev.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
-#include <linux/mpage.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
 #include <linux/smp_lock.h>
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index b88147c1dc27..35d54bb0869a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -19,7 +19,6 @@
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #include <linux/fs.h>
-#include <linux/buffer_head.h>
 #include <linux/stat.h>
 #include <linux/pagemap.h>
 #include <asm/div64.h>
@@ -338,6 +337,7 @@ int cifs_get_inode_info(struct inode **pinode,
 		pfindData = (FILE_ALL_INFO *)buf;
 		/* could do find first instead but this returns more info */
 		rc = CIFSSMBQPathInfo(xid, pTcon, search_path, pfindData,
+			      0 /* not legacy */,
 			      cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
 		/* BB optimize code so we do not make the above call
@@ -385,8 +385,10 @@ int cifs_get_inode_info(struct inode **pinode,
 		/* get new inode */
 		if (*pinode == NULL) {
 			*pinode = new_inode(sb);
-			if (*pinode == NULL)
+			if (*pinode == NULL) {
+				kfree(buf);
 				return -ENOMEM;
+			}
 			/* Is an i_ino of zero legal? Can we use that to check
 			   if the server supports returning inode numbers?  Are
 			   there other sanity checks we can use to ensure that
@@ -432,8 +434,11 @@ int cifs_get_inode_info(struct inode **pinode,
 		(pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
 
 		/* Linux can not store file creation time so ignore it */
-		inode->i_atime =
-		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+		if(pfindData->LastAccessTime)
+			inode->i_atime = cifs_NTtimeToUnix
+				(le64_to_cpu(pfindData->LastAccessTime));
+		else /* do not need to use current_fs_time - time not stored */
+			inode->i_atime = CURRENT_TIME;
 		inode->i_mtime =
 		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
 		inode->i_ctime =
@@ -591,7 +596,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 
 	if (!rc) {
 		if (direntry->d_inode)
-			direntry->d_inode->i_nlink--;
+			drop_nlink(direntry->d_inode);
 	} else if (rc == -ENOENT) {
 		d_drop(direntry);
 	} else if (rc == -ETXTBSY) {
@@ -610,7 +615,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			CIFSSMBClose(xid, pTcon, netfid);
 			if (direntry->d_inode)
-				direntry->d_inode->i_nlink--;
+				drop_nlink(direntry->d_inode);
 		}
 	} else if (rc == -EACCES) {
 		/* try only if r/o attribute set in local lookup data? */
@@ -664,7 +669,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			if (!rc) {
 				if (direntry->d_inode)
-					direntry->d_inode->i_nlink--;
+					drop_nlink(direntry->d_inode);
 			} else if (rc == -ETXTBSY) {
 				int oplock = FALSE;
 				__u16 netfid;
@@ -685,7 +690,7 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
 						    CIFS_MOUNT_MAP_SPECIAL_CHR);
 					CIFSSMBClose(xid, pTcon, netfid);
 					if (direntry->d_inode)
-			                        direntry->d_inode->i_nlink--;
+						drop_nlink(direntry->d_inode);
 				}
 			/* BB if rc = -ETXTBUSY goto the rename logic BB */
 			}
@@ -736,7 +741,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 		cFYI(1, ("cifs_mkdir returned 0x%x", rc));
 		d_drop(direntry);
 	} else {
-		inode->i_nlink++;
+		inc_nlink(inode);
 		if (pTcon->ses->capabilities & CAP_UNIX)
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
 						      inode->i_sb,xid);
@@ -817,9 +822,9 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 			  cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
 
 	if (!rc) {
-		inode->i_nlink--;
+		drop_nlink(inode);
 		i_size_write(direntry->d_inode,0);
-		direntry->d_inode->i_nlink = 0;
+		clear_nlink(direntry->d_inode);
 	}
 
 	cifsInode = CIFS_I(direntry->d_inode);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index b0ea6687ab55..e34c7db00f6f 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -22,7 +22,6 @@
  */
 
 #include <linux/fs.h>
-#include <linux/ext2_fs.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -74,7 +73,7 @@ int cifs_ioctl (struct inode * inode, struct file * filep,
 			}
 			break;
 #ifdef CONFIG_CIFS_POSIX
-		case EXT2_IOC_GETFLAGS:
+		case FS_IOC_GETFLAGS:
 			if(CIFS_UNIX_EXTATTR_CAP & caps) {
 				if (pSMBFile == NULL)
 					break;
@@ -82,12 +81,12 @@ int cifs_ioctl (struct inode * inode, struct file * filep,
 					&ExtAttrBits, &ExtAttrMask);
 				if(rc == 0)
 					rc = put_user(ExtAttrBits &
-						EXT2_FL_USER_VISIBLE,
+						FS_FL_USER_VISIBLE,
 						(int __user *)arg);
 			}
 			break;
 
-		case EXT2_IOC_SETFLAGS:
+		case FS_IOC_SETFLAGS:
 			if(CIFS_UNIX_EXTATTR_CAP & caps) {
 				if(get_user(ExtAttrBits,(int __user *)arg)) {
 					rc = -EFAULT;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index a57f5d6e6213..0bee8b7e521a 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -254,7 +254,11 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 				tmpbuffer,
 				len - 1,
 				cifs_sb->local_nls);
-	else {
+	else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+		cERROR(1,("SFU style symlinks not implemented yet"));
+		/* add open and read as in fs/cifs/inode.c */
+	
+	} else {
 		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, GENERIC_READ,
 				OPEN_REPARSE_POINT,&fid, &oplock, NULL, 
 				cifs_sb->local_nls, 
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 7aa23490541f..ccebf9b7eb86 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -252,10 +252,11 @@ MD5Transform(__u32 buf[4], __u32 const in[16])
 	buf[3] += d;
 }
 
+#if 0   /* currently unused */
 /***********************************************************************
  the rfc 2104 version of hmac_md5 initialisation.
 ***********************************************************************/
-void
+static void
 hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 		      struct HMACMD5Context *ctx)
 {
@@ -289,6 +290,7 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 	MD5Init(&ctx->ctx);
 	MD5Update(&ctx->ctx, ctx->k_ipad, 64);
 }
+#endif
 
 /***********************************************************************
  the microsoft version of hmac_md5 initialisation.
@@ -350,7 +352,8 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
  single function to calculate an HMAC MD5 digest from data.
  use the microsoft hmacmd5 init method because the key is 16 bytes.
 ************************************************************/
-void
+#if 0 /* currently unused */
+static void
 hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
 	 unsigned char *digest)
 {
@@ -361,3 +364,4 @@ hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
 	}
 	hmac_md5_final(digest, &ctx);
 }
+#endif
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index 00e1c5394fe1..f7d4f4197bac 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -27,12 +27,12 @@ void MD5Final(unsigned char digest[16], struct MD5Context *context);
 
 /* The following definitions come from lib/hmacmd5.c  */
 
-void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
-			struct HMACMD5Context *ctx);
+/* void hmac_md5_init_rfc2104(unsigned char *key, int key_len,
+			struct HMACMD5Context *ctx);*/
 void hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
 			struct HMACMD5Context *ctx);
 void hmac_md5_update(const unsigned char *text, int text_len,
 			struct HMACMD5Context *ctx);
 void hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx);
-void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
-			unsigned char *digest);
+/* void hmac_md5(unsigned char key[16], unsigned char *data, int data_len,
+			unsigned char *digest);*/
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 22c937e5884f..bbc9cd34b6ea 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -389,7 +389,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
 	return;
 }
 
-int
+static int
 checkSMBhdr(struct smb_hdr *smb, __u16 mid)
 {
 	/* Make sure that this really is an SMB, that it is a response, 
@@ -418,26 +418,42 @@ checkSMBhdr(struct smb_hdr *smb, __u16 mid)
 }
 
 int
-checkSMB(struct smb_hdr *smb, __u16 mid, int length)
+checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
 {
 	__u32 len = smb->smb_buf_length;
 	__u32 clc_len;  /* calculated length */
 	cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len));
-	if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) ||
-	    (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)) {
-		if ((unsigned int)length < 2 + sizeof (struct smb_hdr)) {
-			if (((unsigned int)length >= 
-				sizeof (struct smb_hdr) - 1)
+
+	if (length < 2 + sizeof (struct smb_hdr)) {
+		if ((length >= sizeof (struct smb_hdr) - 1)
 			    && (smb->Status.CifsError != 0)) {
-				smb->WordCount = 0;
-				/* some error cases do not return wct and bcc */
+			smb->WordCount = 0;
+			/* some error cases do not return wct and bcc */
+			return 0;
+		} else if ((length == sizeof(struct smb_hdr) + 1) && 
+				(smb->WordCount == 0)) {
+			char * tmp = (char *)smb;
+			/* Need to work around a bug in two servers here */
+			/* First, check if the part of bcc they sent was zero */
+			if (tmp[sizeof(struct smb_hdr)] == 0) {
+				/* some servers return only half of bcc
+				 * on simple responses (wct, bcc both zero)
+				 * in particular have seen this on
+				 * ulogoffX and FindClose. This leaves
+				 * one byte of bcc potentially unitialized
+				 */
+				/* zero rest of bcc */
+				tmp[sizeof(struct smb_hdr)+1] = 0;
 				return 0;
-			} else {
-				cERROR(1, ("Length less than smb header size"));
 			}
+			cERROR(1,("rcvd invalid byte count (bcc)"));
+		} else {
+			cERROR(1, ("Length less than smb header size"));
 		}
-		if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
-			cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
+		return 1;
+	}
+	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+		cERROR(1, ("smb length greater than MaxBufSize, mid=%d",
 				   smb->Mid));
 		return 1;
 	}
@@ -446,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length)
 		return 1;
 	clc_len = smbCalcSize_LE(smb);
 
-	if(4 + len != (unsigned int)length) {
+	if(4 + len != length) {
 		cERROR(1, ("Length read does not match RFC1001 length %d",len));
 		return 1;
 	}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index ce87550e918f..992e80edc720 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -909,3 +909,61 @@ cifs_UnixTimeToNT(struct timespec t)
 	/* Convert to 100ns intervals and then add the NTFS time offset. */
 	return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET;
 }
+
+static int total_days_of_prev_months[] =
+{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
+
+
+__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
+{
+	return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
+}
+
+struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
+{
+	struct timespec ts;
+	int sec, min, days, month, year;
+	SMB_TIME * st = (SMB_TIME *)&time;
+	SMB_DATE * sd = (SMB_DATE *)&date;
+
+	cFYI(1,("date %d time %d",date, time));
+
+	sec = 2 * st->TwoSeconds;
+	min = st->Minutes;
+	if((sec > 59) || (min > 59))
+		cERROR(1,("illegal time min %d sec %d", min, sec));
+	sec += (min * 60);
+	sec += 60 * 60 * st->Hours;
+	if(st->Hours > 24)
+		cERROR(1,("illegal hours %d",st->Hours));
+	days = sd->Day;
+	month = sd->Month;
+	if((days > 31) || (month > 12))
+		cERROR(1,("illegal date, month %d day: %d", month, days));
+	month -= 1;
+	days += total_days_of_prev_months[month];
+	days += 3652; /* account for difference in days between 1980 and 1970 */
+	year = sd->Year;
+	days += year * 365;
+	days += (year/4); /* leap year */
+	/* generalized leap year calculation is more complex, ie no leap year
+	for years/100 except for years/400, but since the maximum number for DOS
+	 year is 2**7, the last year is 1980+127, which means we need only
+	 consider 2 special case years, ie the years 2000 and 2100, and only
+	 adjust for the lack of leap year for the year 2100, as 2000 was a 
+	 leap year (divisable by 400) */
+	if(year >= 120)  /* the year 2100 */
+		days = days - 1;  /* do not count leap year for the year 2100 */
+
+	/* adjust for leap year where we are still before leap day */
+	if(year != 120)
+		days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
+	sec += 24 * 60 * 60 * days; 
+
+	ts.tv_sec = sec;
+
+	/* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
+
+	ts.tv_nsec = 0;
+	return ts;
+} 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9aeb58a7d369..b5b0a2a41bef 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -106,6 +106,17 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 	return rc;
 }
 
+static void AdjustForTZ(struct cifsTconInfo * tcon, struct inode * inode)
+{
+	if((tcon) && (tcon->ses) && (tcon->ses->server)) {
+		inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
+		inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
+		inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
+	}
+	return;
+}
+
+
 static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		char * buf, int *pobject_type, int isNewInode)
 {
@@ -135,16 +146,23 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		tmp_inode->i_ctime =
 		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
 	} else { /* legacy, OS2 and DOS style */
+/*		struct timespec ts;*/
 		FIND_FILE_STANDARD_INFO * pfindData = 
 			(FIND_FILE_STANDARD_INFO *)buf;
 
+		tmp_inode->i_mtime = cnvrtDosUnixTm(
+				le16_to_cpu(pfindData->LastWriteDate),
+				le16_to_cpu(pfindData->LastWriteTime));
+		tmp_inode->i_atime = cnvrtDosUnixTm(
+				le16_to_cpu(pfindData->LastAccessDate),
+				le16_to_cpu(pfindData->LastAccessTime));
+                tmp_inode->i_ctime = cnvrtDosUnixTm(
+                                le16_to_cpu(pfindData->LastWriteDate),
+                                le16_to_cpu(pfindData->LastWriteTime));
+		AdjustForTZ(cifs_sb->tcon, tmp_inode);
 		attr = le16_to_cpu(pfindData->Attributes);
 		allocation_size = le32_to_cpu(pfindData->AllocationSize);
 		end_of_file = le32_to_cpu(pfindData->DataSize);
-		tmp_inode->i_atime = CURRENT_TIME;
-		/* tmp_inode->i_mtime =  BB FIXME - add dos time handling
-		tmp_inode->i_ctime = 0;   BB FIXME */
-
 	}
 
 	/* Linux can not store file creation time unfortunately so ignore it */
@@ -216,10 +234,9 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 
 	if (allocation_size < end_of_file)
 		cFYI(1, ("May be sparse file, allocation less than file size"));
-	cFYI(1, ("File Size %ld and blocks %llu and blocksize %ld",
+	cFYI(1, ("File Size %ld and blocks %llu",
 		(unsigned long)tmp_inode->i_size,
-		(unsigned long long)tmp_inode->i_blocks,
-		tmp_inode->i_blksize));
+		(unsigned long long)tmp_inode->i_blocks));
 	if (S_ISREG(tmp_inode->i_mode)) {
 		cFYI(1, ("File inode"));
 		tmp_inode->i_op = &cifs_file_inode_ops;
@@ -939,6 +956,7 @@ static int cifs_save_resume_key(const char *current_entry,
 		filename = &pFindData->FileName[0];
 		/* one byte length, no name conversion */
 		len = (unsigned int)pFindData->FileNameLength;
+		cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
 	} else {
 		cFYI(1,("Unknown findfirst level %d",level));
 		return -EINVAL;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d1705ab8136e..a8a083543ba0 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -111,7 +111,7 @@ static void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
 	bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
 				  nls_cp);
 	bcc_ptr += 2 * bytes_ret;
-	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release,
+	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release,
 				  32, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* trailing null */
@@ -158,8 +158,8 @@ static void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses,
 
 	strcpy(bcc_ptr, "Linux version ");
 	bcc_ptr += strlen("Linux version ");
-	strcpy(bcc_ptr, system_utsname.release);
-	bcc_ptr += strlen(system_utsname.release) + 1;
+	strcpy(bcc_ptr, init_utsname()->release);
+	bcc_ptr += strlen(init_utsname()->release) + 1;
 
 	strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
 	bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
@@ -268,6 +268,10 @@ static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo
 	ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
 	if(ses->serverOS)
 		strncpy(ses->serverOS, bcc_ptr, len);
+	if(strncmp(ses->serverOS, "OS/2",4) == 0) {
+			cFYI(1,("OS/2 server"));
+			ses->flags |= CIFS_SES_OS2;
+	}
 
 	bcc_ptr += len + 1;
 	bleft -= len + 1;
@@ -290,16 +294,11 @@ static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo
         if(len > bleft)
                 return rc;
 
-        if(ses->serverDomain)
-                kfree(ses->serverDomain);
-
-        ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
-        if(ses->serverOS)
-                strncpy(ses->serverOS, bcc_ptr, len);
-
-        bcc_ptr += len + 1;
-	bleft -= len + 1;
-
+	/* No domain field in LANMAN case. Domain is
+	   returned by old servers in the SMB negprot response */
+	/* BB For newer servers which do not support Unicode,
+	   but thus do return domain here we could add parsing
+	   for it later, but it is not very important */
 	cFYI(1,("ascii: bytes left %d",bleft));
 
 	return rc;
@@ -366,6 +365,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 	str_area = kmalloc(2000, GFP_KERNEL);
 	bcc_ptr = str_area;
 
+	ses->flags &= ~CIFS_SES_LANMAN;
+
 	if(type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 		char lnm_session_key[CIFS_SESS_KEY_SIZE];
@@ -377,7 +378,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
 		/* and copy into bcc */
 
 		calc_lanman_hash(ses, lnm_session_key);
-
+		ses->flags |= CIFS_SES_LANMAN; 
 /* #ifdef CONFIG_CIFS_DEBUG2
 		cifs_dump_mem("cryptkey: ",ses->server->cryptKey,
 			CIFS_SESS_KEY_SIZE);
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index efaa044523a7..7a1b2b961ec8 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -364,20 +364,20 @@ E_P24(unsigned char *p21, unsigned char *c8, unsigned char *p24)
 	smbhash(p24 + 16, c8, p21 + 14, 1);
 }
 
-void
+#if 0 /* currently unsued */
+static void
 D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
 	smbhash(out, in, p14, 0);
 	smbhash(out + 8, in + 8, p14 + 7, 0);
 }
 
-void
+static void
 E_old_pw_hash(unsigned char *p14, unsigned char *in, unsigned char *out)
 {
 	smbhash(out, in, p14, 1);
 	smbhash(out + 8, in + 8, p14 + 7, 1);
 }
-#if 0
 /* these routines are currently unneeded, but may be
 	needed later */
 void
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index f518c5e45035..4b25ba92180d 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -51,11 +51,8 @@
 
 void SMBencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
 void E_md4hash(const unsigned char *passwd, unsigned char *p16);
-void nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16]);
 static void SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8,
 		   unsigned char p24[24]);
-void NTLMSSPOWFencrypt(unsigned char passwd[8],
-		       unsigned char *ntlmchalresp, unsigned char p24[24]);
 void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24);
 
 /*
@@ -144,8 +141,9 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16)
 	memset(wpwd,0,129 * 2);
 }
 
+#if 0 /* currently unused */
 /* Does both the NT and LM owfs of a user's password */
-void
+static void
 nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
 {
 	char passwd[514];
@@ -171,6 +169,7 @@ nt_lm_owf_gen(char *pwd, unsigned char nt_p16[16], unsigned char p16[16])
 	/* clear out local copy of user's password (just being paranoid). */
 	memset(passwd, '\0', sizeof (passwd));
 }
+#endif
 
 /* Does the NTLMv2 owfs of a user's password */
 #if 0  /* function not needed yet - but will be soon */
@@ -223,7 +222,8 @@ SMBOWFencrypt(unsigned char passwd[16], unsigned char *c8,
 }
 
 /* Does the des encryption from the FIRST 8 BYTES of the NT or LM MD4 hash. */
-void
+#if 0 /* currently unused */
+static void
 NTLMSSPOWFencrypt(unsigned char passwd[8],
 		  unsigned char *ntlmchalresp, unsigned char p24[24])
 {
@@ -235,6 +235,7 @@ NTLMSSPOWFencrypt(unsigned char passwd[8],
 
 	E_P24(p21, ntlmchalresp, p24);
 }
+#endif
 
 /* Does the NT MD4 hash then des encryption. */
 
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 5597080cb811..95a54253c047 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -110,8 +110,6 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
 	        inode->i_nlink = attr->va_nlink;
 	if (attr->va_size != -1)
 	        inode->i_size = attr->va_size;
-	if (attr->va_blocksize != -1)
-		inode->i_blksize = attr->va_blocksize;
 	if (attr->va_size != -1)
 		inode->i_blocks = (attr->va_size + 511) >> 9;
 	if (attr->va_atime.tv_sec != -1) 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 71f2ea632e53..0102b28a15fb 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -304,7 +304,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode,
 	coda_dir_changed(dir_inode, 0);
 	atomic_inc(&inode->i_count);
 	d_instantiate(de, inode);
-	inode->i_nlink++;
+	inc_nlink(inode);
         
 out:
 	unlock_kernel();
@@ -367,7 +367,7 @@ int coda_unlink(struct inode *dir, struct dentry *de)
         }
 
 	coda_dir_changed(dir, 0);
-	de->d_inode->i_nlink--;
+	drop_nlink(de->d_inode);
 	unlock_kernel();
 
         return 0;
@@ -394,7 +394,7 @@ int coda_rmdir(struct inode *dir, struct dentry *de)
         }
 
 	coda_dir_changed(dir, -1);
-	de->d_inode->i_nlink--;
+	drop_nlink(de->d_inode);
 	d_delete(de);
 	unlock_kernel();
 
@@ -513,7 +513,7 @@ static int coda_venus_readdir(struct file *filp, filldir_t filldir,
 	ino_t ino;
 	int ret, i;
 
-	vdir = (struct venus_dirent *)kmalloc(sizeof(*vdir), GFP_KERNEL);
+	vdir = kmalloc(sizeof(*vdir), GFP_KERNEL);
 	if (!vdir) return -ENOMEM;
 
 	i = filp->f_pos;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 87f1dc8aa24b..88d123321164 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -80,8 +80,7 @@ int coda_init_inodecache(void)
 
 void coda_destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(coda_inode_cachep))
-		printk(KERN_INFO "coda_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(coda_inode_cachep);
 }
 
 static int coda_remount(struct super_block *sb, int *flags, char *data)
diff --git a/fs/compat.c b/fs/compat.c
index ce982f6e8c80..50624d4a70c6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -44,7 +44,7 @@
 #include <linux/nfsd/syscall.h>
 #include <linux/personality.h>
 #include <linux/rwsem.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/mm.h>
 
 #include <net/sock.h>		/* siocdevprivate_ioctl */
@@ -52,8 +52,7 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/ioctls.h>
-
-extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+#include "internal.h"
 
 int compat_log = 1;
 
@@ -69,6 +68,8 @@ int compat_printk(const char *fmt, ...)
 	return ret;
 }
 
+#include "read_write.h"
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -313,9 +314,6 @@ out:
 #define IOCTL_HASHSIZE 256
 static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE];
 
-extern struct ioctl_trans ioctl_start[];
-extern int ioctl_table_size;
-
 static inline unsigned long ioctl32_hash(unsigned long cmd)
 {
 	return (((cmd >> 6) ^ (cmd >> 4) ^ cmd)) % IOCTL_HASHSIZE;
@@ -838,8 +836,6 @@ static int do_nfs4_super_data_conv(void *raw_data)
 	return 0;
 }
 
-extern int copy_mount_options (const void __user *, unsigned long *);
-
 #define SMBFS_NAME      "smbfs"
 #define NCPFS_NAME      "ncpfs"
 #define NFS4_NAME	"nfs4"
@@ -918,20 +914,24 @@ struct compat_readdir_callback {
 };
 
 static int compat_fillonedir(void *__buf, const char *name, int namlen,
-			loff_t offset, ino_t ino, unsigned int d_type)
+			loff_t offset, u64 ino, unsigned int d_type)
 {
 	struct compat_readdir_callback *buf = __buf;
 	struct compat_old_linux_dirent __user *dirent;
+	compat_ulong_t d_ino;
 
 	if (buf->result)
 		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+		return -EOVERFLOW;
 	buf->result++;
 	dirent = buf->dirent;
 	if (!access_ok(VERIFY_WRITE, dirent,
 			(unsigned long)(dirent->d_name + namlen + 1) -
 				(unsigned long)dirent))
 		goto efault;
-	if (	__put_user(ino, &dirent->d_ino) ||
+	if (	__put_user(d_ino, &dirent->d_ino) ||
 		__put_user(offset, &dirent->d_offset) ||
 		__put_user(namlen, &dirent->d_namlen) ||
 		__copy_to_user(dirent->d_name, name, namlen) ||
@@ -982,22 +982,26 @@ struct compat_getdents_callback {
 };
 
 static int compat_filldir(void *__buf, const char *name, int namlen,
-		loff_t offset, ino_t ino, unsigned int d_type)
+		loff_t offset, u64 ino, unsigned int d_type)
 {
 	struct compat_linux_dirent __user * dirent;
 	struct compat_getdents_callback *buf = __buf;
+	compat_ulong_t d_ino;
 	int reclen = COMPAT_ROUND_UP(NAME_OFFSET(dirent) + namlen + 2);
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
 		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+		return -EOVERFLOW;
 	dirent = buf->previous;
 	if (dirent) {
 		if (__put_user(offset, &dirent->d_off))
 			goto efault;
 	}
 	dirent = buf->current_dir;
-	if (__put_user(ino, &dirent->d_ino))
+	if (__put_user(d_ino, &dirent->d_ino))
 		goto efault;
 	if (__put_user(reclen, &dirent->d_reclen))
 		goto efault;
@@ -1068,7 +1072,7 @@ struct compat_getdents_callback64 {
 };
 
 static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t offset,
-		     ino_t ino, unsigned int d_type)
+		     u64 ino, unsigned int d_type)
 {
 	struct linux_dirent64 __user *dirent;
 	struct compat_getdents_callback64 *buf = __buf;
@@ -1153,9 +1157,6 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 			       const struct compat_iovec __user *uvector,
 			       unsigned long nr_segs, loff_t *pos)
 {
-	typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
-	typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *);
-
 	compat_ssize_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov=iovstack, *vector;
@@ -1238,39 +1239,18 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 	fnv = NULL;
 	if (type == READ) {
 		fn = file->f_op->read;
-		fnv = file->f_op->readv;
+		fnv = file->f_op->aio_read;
 	} else {
 		fn = (io_fn_t)file->f_op->write;
-		fnv = file->f_op->writev;
-	}
-	if (fnv) {
-		ret = fnv(file, iov, nr_segs, pos);
-		goto out;
+		fnv = file->f_op->aio_write;
 	}
 
-	/* Do it by hand, with file-ops */
-	ret = 0;
-	vector = iov;
-	while (nr_segs > 0) {
-		void __user * base;
-		size_t len;
-		ssize_t nr;
-
-		base = vector->iov_base;
-		len = vector->iov_len;
-		vector++;
-		nr_segs--;
-
-		nr = fn(file, base, len, pos);
+	if (fnv)
+		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+						pos, fnv);
+	else
+		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 
-		if (nr < 0) {
-			if (!ret) ret = nr;
-			break;
-		}
-		ret += nr;
-		if (nr != len)
-			break;
-	}
 out:
 	if (iov != iovstack)
 		kfree(iov);
@@ -1298,7 +1278,7 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
 		goto out;
 
 	ret = -EINVAL;
-	if (!file->f_op || (!file->f_op->readv && !file->f_op->read))
+	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 		goto out;
 
 	ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
@@ -1321,7 +1301,7 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
 		goto out;
 
 	ret = -EINVAL;
-	if (!file->f_op || (!file->f_op->writev && !file->f_op->write))
+	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 		goto out;
 
 	ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
@@ -1336,7 +1316,7 @@ compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32,
 		    unsigned int nr_segs, unsigned int flags)
 {
 	unsigned i;
-	struct iovec *iov;
+	struct iovec __user *iov;
 	if (nr_segs > UIO_MAXIOV)
 		return -EINVAL;
 	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 4063a9396977..a91f2628c981 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -40,15 +40,11 @@
 #include <linux/if_pppox.h>
 #include <linux/mtio.h>
 #include <linux/cdrom.h>
-#include <linux/loop.h>
 #include <linux/auto_fs.h>
 #include <linux/auto_fs4.h>
 #include <linux/tty.h>
 #include <linux/vt_kern.h>
 #include <linux/fb.h>
-#include <linux/ext2_fs.h>
-#include <linux/ext3_jbd.h>
-#include <linux/ext3_fs.h>
 #include <linux/videodev.h>
 #include <linux/netdevice.h>
 #include <linux/raw.h>
@@ -60,12 +56,10 @@
 #include <linux/pci.h>
 #include <linux/module.h>
 #include <linux/serial.h>
-#include <linux/reiserfs_fs.h>
 #include <linux/if_tun.h>
 #include <linux/ctype.h>
 #include <linux/ioctl32.h>
 #include <linux/syscalls.h>
-#include <linux/ncp_fs.h>
 #include <linux/i2c.h>
 #include <linux/i2c-dev.h>
 #include <linux/wireless.h>
@@ -113,7 +107,6 @@
 #include <linux/nbd.h>
 #include <linux/random.h>
 #include <linux/filter.h>
-#include <linux/msdos_fs.h>
 #include <linux/pktcdvd.h>
 
 #include <linux/hiddev.h>
@@ -124,21 +117,6 @@
 #include <linux/dvb/video.h>
 #include <linux/lp.h>
 
-/* Aiee. Someone does not find a difference between int and long */
-#define EXT2_IOC32_GETFLAGS               _IOR('f', 1, int)
-#define EXT2_IOC32_SETFLAGS               _IOW('f', 2, int)
-#define EXT3_IOC32_GETVERSION             _IOR('f', 3, int)
-#define EXT3_IOC32_SETVERSION             _IOW('f', 4, int)
-#define EXT3_IOC32_GETRSVSZ               _IOR('f', 5, int)
-#define EXT3_IOC32_SETRSVSZ               _IOW('f', 6, int)
-#define EXT3_IOC32_GROUP_EXTEND           _IOW('f', 7, unsigned int)
-#ifdef CONFIG_JBD_DEBUG
-#define EXT3_IOC32_WAIT_FOR_READONLY      _IOR('f', 99, int)
-#endif
-
-#define EXT2_IOC32_GETVERSION             _IOR('v', 1, int)
-#define EXT2_IOC32_SETVERSION             _IOW('v', 2, int)
-
 static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd,
 			      unsigned long arg, struct file *f)
 {
@@ -176,34 +154,6 @@ static int rw_long(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-static int do_ext2_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	/* These are just misnamed, they actually get/put from/to user an int */
-	switch (cmd) {
-	case EXT2_IOC32_GETFLAGS: cmd = EXT2_IOC_GETFLAGS; break;
-	case EXT2_IOC32_SETFLAGS: cmd = EXT2_IOC_SETFLAGS; break;
-	case EXT2_IOC32_GETVERSION: cmd = EXT2_IOC_GETVERSION; break;
-	case EXT2_IOC32_SETVERSION: cmd = EXT2_IOC_SETVERSION; break;
-	}
-	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
-}
-
-static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	/* These are just misnamed, they actually get/put from/to user an int */
-	switch (cmd) {
-	case EXT3_IOC32_GETVERSION: cmd = EXT3_IOC_GETVERSION; break;
-	case EXT3_IOC32_SETVERSION: cmd = EXT3_IOC_SETVERSION; break;
-	case EXT3_IOC32_GETRSVSZ: cmd = EXT3_IOC_GETRSVSZ; break;
-	case EXT3_IOC32_SETRSVSZ: cmd = EXT3_IOC_SETRSVSZ; break;
-	case EXT3_IOC32_GROUP_EXTEND: cmd = EXT3_IOC_GROUP_EXTEND; break;
-#ifdef CONFIG_JBD_DEBUG
-	case EXT3_IOC32_WAIT_FOR_READONLY: cmd = EXT3_IOC_WAIT_FOR_READONLY; break;
-#endif
-	}
-	return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg));
-}
-
 struct compat_video_event {
 	int32_t		type;
 	compat_time_t	timestamp;
@@ -694,6 +644,7 @@ out:
 }
 #endif
 
+#ifdef CONFIG_BLOCK
 struct hd_geometry32 {
 	unsigned char heads;
 	unsigned char sectors;
@@ -918,6 +869,7 @@ static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 	}
 	return err;
 }
+#endif /* CONFIG_BLOCK */
 
 struct sock_fprog32 {
 	unsigned short	len;
@@ -1041,6 +993,7 @@ static int ppp_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 }
 
 
+#ifdef CONFIG_BLOCK
 struct mtget32 {
 	compat_long_t	mt_type;
 	compat_long_t	mt_resid;
@@ -1213,73 +1166,7 @@ static int cdrom_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long ar
 
 	return err;
 }
-
-struct loop_info32 {
-	compat_int_t	lo_number;      /* ioctl r/o */
-	compat_dev_t	lo_device;      /* ioctl r/o */
-	compat_ulong_t	lo_inode;       /* ioctl r/o */
-	compat_dev_t	lo_rdevice;     /* ioctl r/o */
-	compat_int_t	lo_offset;
-	compat_int_t	lo_encrypt_type;
-	compat_int_t	lo_encrypt_key_size;    /* ioctl w/o */
-	compat_int_t	lo_flags;       /* ioctl r/o */
-	char		lo_name[LO_NAME_SIZE];
-	unsigned char	lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
-	compat_ulong_t	lo_init[2];
-	char		reserved[4];
-};
-
-static int loop_status(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	mm_segment_t old_fs = get_fs();
-	struct loop_info l;
-	struct loop_info32 __user *ul;
-	int err = -EINVAL;
-
-	ul = compat_ptr(arg);
-	switch(cmd) {
-	case LOOP_SET_STATUS:
-		err = get_user(l.lo_number, &ul->lo_number);
-		err |= __get_user(l.lo_device, &ul->lo_device);
-		err |= __get_user(l.lo_inode, &ul->lo_inode);
-		err |= __get_user(l.lo_rdevice, &ul->lo_rdevice);
-		err |= __copy_from_user(&l.lo_offset, &ul->lo_offset,
-		        8 + (unsigned long)l.lo_init - (unsigned long)&l.lo_offset);
-		if (err) {
-			err = -EFAULT;
-		} else {
-			set_fs (KERNEL_DS);
-			err = sys_ioctl (fd, cmd, (unsigned long)&l);
-			set_fs (old_fs);
-		}
-		break;
-	case LOOP_GET_STATUS:
-		set_fs (KERNEL_DS);
-		err = sys_ioctl (fd, cmd, (unsigned long)&l);
-		set_fs (old_fs);
-		if (!err) {
-			err = put_user(l.lo_number, &ul->lo_number);
-			err |= __put_user(l.lo_device, &ul->lo_device);
-			err |= __put_user(l.lo_inode, &ul->lo_inode);
-			err |= __put_user(l.lo_rdevice, &ul->lo_rdevice);
-			err |= __copy_to_user(&ul->lo_offset, &l.lo_offset,
-				(unsigned long)l.lo_init - (unsigned long)&l.lo_offset);
-			if (err)
-				err = -EFAULT;
-		}
-		break;
-	default: {
-		static int count;
-		if (++count <= 20)
-			printk("%s: Unknown loop ioctl cmd, fd(%d) "
-			       "cmd(%08x) arg(%08lx)\n",
-			       __FUNCTION__, fd, cmd, arg);
-	}
-	}
-	return err;
-}
-
-extern int tty_ioctl(struct inode * inode, struct file * file, unsigned int cmd, unsigned long arg);
+#endif /* CONFIG_BLOCK */
 
 #ifdef CONFIG_VT
 
@@ -1607,6 +1494,7 @@ ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return -EINVAL;
 }
 
+#ifdef CONFIG_BLOCK
 static int broken_blkgetsize(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	/* The mkswap binary hard codes it to Intel value :-((( */
@@ -1641,12 +1529,14 @@ static int blkpg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long ar
 
 	return sys_ioctl(fd, cmd, (unsigned long)a);
 }
+#endif
 
 static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg);
 }
 
+#ifdef CONFIG_BLOCK
 /* Fix sizeof(sizeof()) breakage */
 #define BLKBSZGET_32   _IOR(0x12,112,int)
 #define BLKBSZSET_32   _IOW(0x12,113,int)
@@ -1667,6 +1557,7 @@ static int do_blkgetsize64(unsigned int fd, unsigned int cmd,
 {
        return sys_ioctl(fd, BLKGETSIZE64, (unsigned long)compat_ptr(arg));
 }
+#endif
 
 /* Bluetooth ioctls */
 #define HCIUARTSETPROTO	_IOW('U', 200, int)
@@ -1687,6 +1578,7 @@ static int do_blkgetsize64(unsigned int fd, unsigned int cmd,
 #define HIDPGETCONNLIST	_IOR('H', 210, int)
 #define HIDPGETCONNINFO	_IOR('H', 211, int)
 
+#ifdef CONFIG_BLOCK
 struct floppy_struct32 {
 	compat_uint_t	size;
 	compat_uint_t	sect;
@@ -2011,6 +1903,7 @@ out:
 	kfree(karg);
 	return err;
 }
+#endif
 
 struct mtd_oob_buf32 {
 	u_int32_t start;
@@ -2052,61 +1945,7 @@ static int mtd_rw_oob(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }	
 
-#define	VFAT_IOCTL_READDIR_BOTH32	_IOR('r', 1, struct compat_dirent[2])
-#define	VFAT_IOCTL_READDIR_SHORT32	_IOR('r', 2, struct compat_dirent[2])
-
-static long
-put_dirent32 (struct dirent *d, struct compat_dirent __user *d32)
-{
-        if (!access_ok(VERIFY_WRITE, d32, sizeof(struct compat_dirent)))
-                return -EFAULT;
-
-        __put_user(d->d_ino, &d32->d_ino);
-        __put_user(d->d_off, &d32->d_off);
-        __put_user(d->d_reclen, &d32->d_reclen);
-        if (__copy_to_user(d32->d_name, d->d_name, d->d_reclen))
-		return -EFAULT;
-
-        return 0;
-}
-
-static int vfat_ioctl32(unsigned fd, unsigned cmd, unsigned long arg)
-{
-	struct compat_dirent __user *p = compat_ptr(arg);
-	int ret;
-	mm_segment_t oldfs = get_fs();
-	struct dirent d[2];
-
-	switch(cmd)
-	{
-        	case VFAT_IOCTL_READDIR_BOTH32:
-                	cmd = VFAT_IOCTL_READDIR_BOTH;
-                	break;
-        	case VFAT_IOCTL_READDIR_SHORT32:
-                	cmd = VFAT_IOCTL_READDIR_SHORT;
-                	break;
-	}
-
-	set_fs(KERNEL_DS);
-	ret = sys_ioctl(fd,cmd,(unsigned long)&d);
-	set_fs(oldfs);
-	if (ret >= 0) {
-		ret |= put_dirent32(&d[0], p);
-		ret |= put_dirent32(&d[1], p + 1);
-	}
-	return ret;
-}
-
-#define REISERFS_IOC_UNPACK32               _IOW(0xCD,1,int)
-
-static int reiserfs_ioctl32(unsigned fd, unsigned cmd, unsigned long ptr)
-{
-        if (cmd == REISERFS_IOC_UNPACK32)
-                cmd = REISERFS_IOC_UNPACK;
-
-        return sys_ioctl(fd,cmd,ptr);
-}
-
+#ifdef CONFIG_BLOCK
 struct raw32_config_request
 {
         compat_int_t    raw_minor;
@@ -2171,6 +2010,7 @@ static int raw_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
         }
         return ret;
 }
+#endif /* CONFIG_BLOCK */
 
 struct serial_struct32 {
         compat_int_t    type;
@@ -2507,193 +2347,6 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
 	}
 }
 
-#if defined(CONFIG_NCP_FS) || defined(CONFIG_NCP_FS_MODULE)
-struct ncp_ioctl_request_32 {
-	u32 function;
-	u32 size;
-	compat_caddr_t data;
-};
-
-struct ncp_fs_info_v2_32 {
-	s32 version;
-	u32 mounted_uid;
-	u32 connection;
-	u32 buffer_size;
-
-	u32 volume_number;
-	u32 directory_id;
-
-	u32 dummy1;
-	u32 dummy2;
-	u32 dummy3;
-};
-
-struct ncp_objectname_ioctl_32
-{
-	s32		auth_type;
-	u32		object_name_len;
-	compat_caddr_t	object_name;	/* an userspace data, in most cases user name */
-};
-
-struct ncp_privatedata_ioctl_32
-{
-	u32		len;
-	compat_caddr_t	data;		/* ~1000 for NDS */
-};
-
-#define	NCP_IOC_NCPREQUEST_32		_IOR('n', 1, struct ncp_ioctl_request_32)
-#define NCP_IOC_GETMOUNTUID2_32		_IOW('n', 2, u32)
-#define NCP_IOC_GET_FS_INFO_V2_32	_IOWR('n', 4, struct ncp_fs_info_v2_32)
-#define NCP_IOC_GETOBJECTNAME_32	_IOWR('n', 9, struct ncp_objectname_ioctl_32)
-#define NCP_IOC_SETOBJECTNAME_32	_IOR('n', 9, struct ncp_objectname_ioctl_32)
-#define NCP_IOC_GETPRIVATEDATA_32	_IOWR('n', 10, struct ncp_privatedata_ioctl_32)
-#define NCP_IOC_SETPRIVATEDATA_32	_IOR('n', 10, struct ncp_privatedata_ioctl_32)
-
-static int do_ncp_ncprequest(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ncp_ioctl_request_32 n32;
-	struct ncp_ioctl_request __user *p = compat_alloc_user_space(sizeof(*p));
-
-	if (copy_from_user(&n32, compat_ptr(arg), sizeof(n32)) ||
-	    put_user(n32.function, &p->function) ||
-	    put_user(n32.size, &p->size) ||
-	    put_user(compat_ptr(n32.data), &p->data))
-		return -EFAULT;
-
-	return sys_ioctl(fd, NCP_IOC_NCPREQUEST, (unsigned long)p);
-}
-
-static int do_ncp_getmountuid2(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	mm_segment_t old_fs = get_fs();
-	__kernel_uid_t kuid;
-	int err;
-
-	cmd = NCP_IOC_GETMOUNTUID2;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&kuid);
-	set_fs(old_fs);
-
-	if (!err)
-		err = put_user(kuid,
-			       (unsigned int __user *) compat_ptr(arg));
-
-	return err;
-}
-
-static int do_ncp_getfsinfo2(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	mm_segment_t old_fs = get_fs();
-	struct ncp_fs_info_v2_32 n32;
-	struct ncp_fs_info_v2 n;
-	int err;
-
-	if (copy_from_user(&n32, compat_ptr(arg), sizeof(n32)))
-		return -EFAULT;
-	if (n32.version != NCP_GET_FS_INFO_VERSION_V2)
-		return -EINVAL;
-	n.version = NCP_GET_FS_INFO_VERSION_V2;
-
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, NCP_IOC_GET_FS_INFO_V2, (unsigned long)&n);
-	set_fs(old_fs);
-
-	if (!err) {
-		n32.version = n.version;
-		n32.mounted_uid = n.mounted_uid;
-		n32.connection = n.connection;
-		n32.buffer_size = n.buffer_size;
-		n32.volume_number = n.volume_number;
-		n32.directory_id = n.directory_id;
-		n32.dummy1 = n.dummy1;
-		n32.dummy2 = n.dummy2;
-		n32.dummy3 = n.dummy3;
-		err = copy_to_user(compat_ptr(arg), &n32, sizeof(n32)) ? -EFAULT : 0;
-	}
-	return err;
-}
-
-static int do_ncp_getobjectname(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ncp_objectname_ioctl_32 n32, __user *p32 = compat_ptr(arg);
-	struct ncp_objectname_ioctl __user *p = compat_alloc_user_space(sizeof(*p));
-	s32 auth_type;
-	u32 name_len;
-	int err;
-
-	if (copy_from_user(&n32, p32, sizeof(n32)) ||
-	    put_user(n32.object_name_len, &p->object_name_len) ||
-	    put_user(compat_ptr(n32.object_name), &p->object_name))
-		return -EFAULT;
-
-	err = sys_ioctl(fd, NCP_IOC_GETOBJECTNAME, (unsigned long)p);
-        if (err)
-		return err;
-
-	if (get_user(auth_type, &p->auth_type) ||
-	    put_user(auth_type, &p32->auth_type) ||
-	    get_user(name_len, &p->object_name_len) ||
-	    put_user(name_len, &p32->object_name_len))
-		return -EFAULT;
-
-	return 0;
-}
-
-static int do_ncp_setobjectname(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ncp_objectname_ioctl_32 n32, __user *p32 = compat_ptr(arg);
-	struct ncp_objectname_ioctl __user *p = compat_alloc_user_space(sizeof(*p));
-
-	if (copy_from_user(&n32, p32, sizeof(n32)) ||
-	    put_user(n32.auth_type, &p->auth_type) ||
-	    put_user(n32.object_name_len, &p->object_name_len) ||
-	    put_user(compat_ptr(n32.object_name), &p->object_name))
-		return -EFAULT;
-
-	return sys_ioctl(fd, NCP_IOC_SETOBJECTNAME, (unsigned long)p);
-}
-
-static int do_ncp_getprivatedata(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ncp_privatedata_ioctl_32 n32, __user *p32 = compat_ptr(arg);
-	struct ncp_privatedata_ioctl __user *p =
-		compat_alloc_user_space(sizeof(*p));
-	u32 len;
-	int err;
-
-	if (copy_from_user(&n32, p32, sizeof(n32)) ||
-	    put_user(n32.len, &p->len) ||
-	    put_user(compat_ptr(n32.data), &p->data))
-		return -EFAULT;
-
-	err = sys_ioctl(fd, NCP_IOC_GETPRIVATEDATA, (unsigned long)p);
-        if (err)
-		return err;
-
-	if (get_user(len, &p->len) ||
-	    put_user(len, &p32->len))
-		return -EFAULT;
-
-	return 0;
-}
-
-static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct ncp_privatedata_ioctl_32 n32;
-	struct ncp_privatedata_ioctl_32 __user *p32 = compat_ptr(arg);
-	struct ncp_privatedata_ioctl __user *p =
-		compat_alloc_user_space(sizeof(*p));
-
-	if (copy_from_user(&n32, p32, sizeof(n32)) ||
-	    put_user(n32.len, &p->len) ||
-	    put_user(compat_ptr(n32.data), &p->data))
-		return -EFAULT;
-
-	return sys_ioctl(fd, NCP_IOC_SETPRIVATEDATA, (unsigned long)p);
-}
-#endif
-
 static int
 lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
@@ -2777,6 +2430,7 @@ HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
 HANDLE_IOCTL(SIOCRTMSG, ret_einval)
 HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
 #endif
+#ifdef CONFIG_BLOCK
 HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
 HANDLE_IOCTL(BLKRAGET, w_long)
 HANDLE_IOCTL(BLKGETSIZE, w_long)
@@ -2784,13 +2438,17 @@ HANDLE_IOCTL(0x1260, broken_blkgetsize)
 HANDLE_IOCTL(BLKFRAGET, w_long)
 HANDLE_IOCTL(BLKSECTGET, w_long)
 HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_UNMASKINTR, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_MULTCOUNT, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_NOWERR, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_NICE, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_WCACHE, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_ACOUSTIC, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_ADDRESS, hdio_ioctl_trans)
+HANDLE_IOCTL(HDIO_GET_BUSSTATE, hdio_ioctl_trans)
 HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans)
@@ -2802,16 +2460,17 @@ HANDLE_IOCTL(FDGETFDCSTAT32, fd_ioctl_trans)
 HANDLE_IOCTL(FDWERRORGET32, fd_ioctl_trans)
 HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
 HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
+#endif
 HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans)
 HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans)
+#ifdef CONFIG_BLOCK
 HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans)
 HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans)
 HANDLE_IOCTL(CDROMREADAUDIO, cdrom_ioctl_trans)
 HANDLE_IOCTL(CDROM_SEND_PACKET, cdrom_ioctl_trans)
-HANDLE_IOCTL(LOOP_SET_STATUS, loop_status)
-HANDLE_IOCTL(LOOP_GET_STATUS, loop_status)
+#endif
 #define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
 HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout)
 #ifdef CONFIG_VT
@@ -2821,19 +2480,6 @@ HANDLE_IOCTL(PIO_UNIMAP, do_unimap_ioctl)
 HANDLE_IOCTL(GIO_UNIMAP, do_unimap_ioctl)
 HANDLE_IOCTL(KDFONTOP, do_kdfontop_ioctl)
 #endif
-HANDLE_IOCTL(EXT2_IOC32_GETFLAGS, do_ext2_ioctl)
-HANDLE_IOCTL(EXT2_IOC32_SETFLAGS, do_ext2_ioctl)
-HANDLE_IOCTL(EXT2_IOC32_GETVERSION, do_ext2_ioctl)
-HANDLE_IOCTL(EXT2_IOC32_SETVERSION, do_ext2_ioctl)
-HANDLE_IOCTL(EXT3_IOC32_GETVERSION, do_ext3_ioctl)
-HANDLE_IOCTL(EXT3_IOC32_SETVERSION, do_ext3_ioctl)
-HANDLE_IOCTL(EXT3_IOC32_GETRSVSZ, do_ext3_ioctl)
-HANDLE_IOCTL(EXT3_IOC32_SETRSVSZ, do_ext3_ioctl)
-HANDLE_IOCTL(EXT3_IOC32_GROUP_EXTEND, do_ext3_ioctl)
-COMPATIBLE_IOCTL(EXT3_IOC_GROUP_ADD)
-#ifdef CONFIG_JBD_DEBUG
-HANDLE_IOCTL(EXT3_IOC32_WAIT_FOR_READONLY, do_ext3_ioctl)
-#endif
 /* One SMB ioctl needs translations. */
 #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t)
 HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid)
@@ -2863,16 +2509,14 @@ HANDLE_IOCTL(SONET_SETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
 /* block stuff */
+#ifdef CONFIG_BLOCK
 HANDLE_IOCTL(BLKBSZGET_32, do_blkbszget)
 HANDLE_IOCTL(BLKBSZSET_32, do_blkbszset)
 HANDLE_IOCTL(BLKGETSIZE64_32, do_blkgetsize64)
-/* vfat */
-HANDLE_IOCTL(VFAT_IOCTL_READDIR_BOTH32, vfat_ioctl32)
-HANDLE_IOCTL(VFAT_IOCTL_READDIR_SHORT32, vfat_ioctl32)
-HANDLE_IOCTL(REISERFS_IOC_UNPACK32, reiserfs_ioctl32)
 /* Raw devices */
 HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
 HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
+#endif
 /* Serial */
 HANDLE_IOCTL(TIOCGSERIAL, serial_struct_ioctl)
 HANDLE_IOCTL(TIOCSSERIAL, serial_struct_ioctl)
@@ -2920,16 +2564,6 @@ HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl)
 HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl)
 
-#if defined(CONFIG_NCP_FS) || defined(CONFIG_NCP_FS_MODULE)
-HANDLE_IOCTL(NCP_IOC_NCPREQUEST_32, do_ncp_ncprequest)
-HANDLE_IOCTL(NCP_IOC_GETMOUNTUID2_32, do_ncp_getmountuid2)
-HANDLE_IOCTL(NCP_IOC_GET_FS_INFO_V2_32, do_ncp_getfsinfo2)
-HANDLE_IOCTL(NCP_IOC_GETOBJECTNAME_32, do_ncp_getobjectname)
-HANDLE_IOCTL(NCP_IOC_SETOBJECTNAME_32, do_ncp_setobjectname)
-HANDLE_IOCTL(NCP_IOC_GETPRIVATEDATA_32, do_ncp_getprivatedata)
-HANDLE_IOCTL(NCP_IOC_SETPRIVATEDATA_32, do_ncp_setprivatedata)
-#endif
-
 /* dvb */
 HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event)
 HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 816e8ef64560..8a3b6a1a6ad1 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -139,7 +139,7 @@ static int init_dir(struct inode * inode)
 	inode->i_fop = &configfs_dir_operations;
 
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inode->i_nlink++;
+	inc_nlink(inode);
 	return 0;
 }
 
@@ -169,7 +169,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
 	if (!error) {
 		error = configfs_create(d, mode, init_dir);
 		if (!error) {
-			p->d_inode->i_nlink++;
+			inc_nlink(p->d_inode);
 			(d)->d_op = &configfs_dentry_ops;
 		} else {
 			struct configfs_dirent *sd = d->d_fsdata;
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index f499803743e0..cf33fac68c84 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -137,8 +137,8 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
 		if ((retval = fill_read_buffer(file->f_dentry,buffer)))
 			goto out;
 	}
-	pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
-		 __FUNCTION__,count,*ppos,buffer->page);
+	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
+		 __FUNCTION__, count, *ppos, buffer->page);
 	retval = flush_read_buffer(buffer,buf,count,ppos);
 out:
 	up(&buffer->sem);
@@ -274,15 +274,15 @@ static int check_perm(struct inode * inode, struct file * file)
 	/* No error? Great, allocate a buffer for the file, and store it
 	 * it in file->private_data for easy access.
 	 */
-	buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
-	if (buffer) {
-		memset(buffer,0,sizeof(struct configfs_buffer));
-		init_MUTEX(&buffer->sem);
-		buffer->needs_read_fill = 1;
-		buffer->ops = ops;
-		file->private_data = buffer;
-	} else
+	buffer = kzalloc(sizeof(struct configfs_buffer),GFP_KERNEL);
+	if (!buffer) {
 		error = -ENOMEM;
+		goto Enomem;
+	}
+	init_MUTEX(&buffer->sem);
+	buffer->needs_read_fill = 1;
+	buffer->ops = ops;
+	file->private_data = buffer;
 	goto Done;
 
  Einval:
@@ -290,6 +290,7 @@ static int check_perm(struct inode * inode, struct file * file)
 	goto Done;
  Eaccess:
 	error = -EACCES;
+ Enomem:
 	module_put(attr->ca_owner);
  Done:
 	if (error && item)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index e14488ca6411..fb18917954a9 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -76,11 +76,10 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 
 	if (!sd_iattr) {
 		/* setting attributes for the first time, allocate now */
-		sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL);
+		sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
 		if (!sd_iattr)
 			return -ENOMEM;
 		/* assign default attributes */
-		memset(sd_iattr, 0, sizeof(struct iattr));
 		sd_iattr->ia_mode = sd->s_mode;
 		sd_iattr->ia_uid = 0;
 		sd_iattr->ia_gid = 0;
@@ -136,7 +135,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
 	struct inode * inode = new_inode(configfs_sb);
 	if (inode) {
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &configfs_aops;
 		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index e07485ac50ad..24421209f854 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -224,4 +224,4 @@ EXPORT_SYMBOL(config_item_init);
 EXPORT_SYMBOL(config_group_init);
 EXPORT_SYMBOL(config_item_get);
 EXPORT_SYMBOL(config_item_put);
-
+EXPORT_SYMBOL(config_group_find_obj);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 3e5fe843e1df..68bd5c93ca52 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -84,7 +84,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 		inode->i_op = &configfs_dir_inode_operations;
 		inode->i_fop = &configfs_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 	} else {
 		pr_debug("configfs: could not get root inode\n");
 		return -ENOMEM;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 223c0431042d..a624c3ec8189 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -73,7 +73,6 @@ static int cramfs_iget5_set(struct inode *inode, void *opaque)
 	inode->i_uid = cramfs_inode->uid;
 	inode->i_size = cramfs_inode->size;
 	inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_gid = cramfs_inode->gid;
 	/* Struct copy intentional */
 	inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
@@ -242,11 +241,10 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_flags |= MS_RDONLY;
 
-	sbi = kmalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 	sb->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(struct cramfs_sb_info));
 
 	/* Invalidate the read buffers on mount: think disk change.. */
 	mutex_lock(&read_mutex);
@@ -545,8 +543,15 @@ static struct file_system_type cramfs_fs_type = {
 
 static int __init init_cramfs_fs(void)
 {
-	cramfs_uncompress_init();
-	return register_filesystem(&cramfs_fs_type);
+	int rv;
+
+	rv = cramfs_uncompress_init();
+	if (rv < 0)
+		return rv;
+	rv = register_filesystem(&cramfs_fs_type);
+	if (rv < 0)
+		cramfs_uncompress_exit();
+	return rv;
 }
 
 static void __exit exit_cramfs_fs(void)
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index 8def89f2c438..fc3ccb74626f 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -68,11 +68,10 @@ int cramfs_uncompress_init(void)
 	return 0;
 }
 
-int cramfs_uncompress_exit(void)
+void cramfs_uncompress_exit(void)
 {
 	if (!--initialized) {
 		zlib_inflateEnd(&stream);
 		vfree(stream.workspace);
 	}
-	return 0;
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index 17b392a2049e..a1ff91eef108 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
 #include <linux/seqlock.h>
 #include <linux/swap.h>
 #include <linux/bootmem.h>
+#include "internal.h"
 
 
 int sysctl_vfs_cache_pressure __read_mostly = 100;
@@ -290,9 +291,9 @@ struct dentry * dget_locked(struct dentry *dentry)
  * it can be unhashed only if it has no children, or if it is the root
  * of a filesystem.
  *
- * If the inode has a DCACHE_DISCONNECTED alias, then prefer
+ * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
  * any other hashed alias over that one unless @want_discon is set,
- * in which case only return a DCACHE_DISCONNECTED alias.
+ * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
  */
 
 static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
@@ -308,7 +309,8 @@ static struct dentry * __d_find_alias(struct inode *inode, int want_discon)
 		prefetch(next);
 		alias = list_entry(tmp, struct dentry, d_alias);
  		if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
-			if (alias->d_flags & DCACHE_DISCONNECTED)
+			if (IS_ROOT(alias) &&
+			    (alias->d_flags & DCACHE_DISCONNECTED))
 				discon_alias = alias;
 			else if (!want_discon) {
 				__dget_locked(alias);
@@ -547,6 +549,136 @@ repeat:
 }
 
 /*
+ * destroy a single subtree of dentries for unmount
+ * - see the comments on shrink_dcache_for_umount() for a description of the
+ *   locking
+ */
+static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	BUG_ON(!IS_ROOT(dentry));
+
+	/* detach this root from the system */
+	spin_lock(&dcache_lock);
+	if (!list_empty(&dentry->d_lru)) {
+		dentry_stat.nr_unused--;
+		list_del_init(&dentry->d_lru);
+	}
+	__d_drop(dentry);
+	spin_unlock(&dcache_lock);
+
+	for (;;) {
+		/* descend to the first leaf in the current subtree */
+		while (!list_empty(&dentry->d_subdirs)) {
+			struct dentry *loop;
+
+			/* this is a branch with children - detach all of them
+			 * from the system in one go */
+			spin_lock(&dcache_lock);
+			list_for_each_entry(loop, &dentry->d_subdirs,
+					    d_u.d_child) {
+				if (!list_empty(&loop->d_lru)) {
+					dentry_stat.nr_unused--;
+					list_del_init(&loop->d_lru);
+				}
+
+				__d_drop(loop);
+				cond_resched_lock(&dcache_lock);
+			}
+			spin_unlock(&dcache_lock);
+
+			/* move to the first child */
+			dentry = list_entry(dentry->d_subdirs.next,
+					    struct dentry, d_u.d_child);
+		}
+
+		/* consume the dentries from this leaf up through its parents
+		 * until we find one with children or run out altogether */
+		do {
+			struct inode *inode;
+
+			if (atomic_read(&dentry->d_count) != 0) {
+				printk(KERN_ERR
+				       "BUG: Dentry %p{i=%lx,n=%s}"
+				       " still in use (%d)"
+				       " [unmount of %s %s]\n",
+				       dentry,
+				       dentry->d_inode ?
+				       dentry->d_inode->i_ino : 0UL,
+				       dentry->d_name.name,
+				       atomic_read(&dentry->d_count),
+				       dentry->d_sb->s_type->name,
+				       dentry->d_sb->s_id);
+				BUG();
+			}
+
+			parent = dentry->d_parent;
+			if (parent == dentry)
+				parent = NULL;
+			else
+				atomic_dec(&parent->d_count);
+
+			list_del(&dentry->d_u.d_child);
+			dentry_stat.nr_dentry--;	/* For d_free, below */
+
+			inode = dentry->d_inode;
+			if (inode) {
+				dentry->d_inode = NULL;
+				list_del_init(&dentry->d_alias);
+				if (dentry->d_op && dentry->d_op->d_iput)
+					dentry->d_op->d_iput(dentry, inode);
+				else
+					iput(inode);
+			}
+
+			d_free(dentry);
+
+			/* finished when we fall off the top of the tree,
+			 * otherwise we ascend to the parent and move to the
+			 * next sibling if there is one */
+			if (!parent)
+				return;
+
+			dentry = parent;
+
+		} while (list_empty(&dentry->d_subdirs));
+
+		dentry = list_entry(dentry->d_subdirs.next,
+				    struct dentry, d_u.d_child);
+	}
+}
+
+/*
+ * destroy the dentries attached to a superblock on unmounting
+ * - we don't need to use dentry->d_lock, and only need dcache_lock when
+ *   removing the dentry from the system lists and hashes because:
+ *   - the superblock is detached from all mountings and open files, so the
+ *     dentry trees will not be rearranged by the VFS
+ *   - s_umount is write-locked, so the memory pressure shrinker will ignore
+ *     any dentries belonging to this superblock that it comes across
+ *   - the filesystem itself is no longer permitted to rearrange the dentries
+ *     in this superblock
+ */
+void shrink_dcache_for_umount(struct super_block *sb)
+{
+	struct dentry *dentry;
+
+	if (down_read_trylock(&sb->s_umount))
+		BUG();
+
+	dentry = sb->s_root;
+	sb->s_root = NULL;
+	atomic_dec(&dentry->d_count);
+	shrink_dcache_for_umount_subtree(dentry);
+
+	while (!hlist_empty(&sb->s_anon)) {
+		dentry = hlist_entry(sb->s_anon.first, struct dentry, d_hash);
+		shrink_dcache_for_umount_subtree(dentry);
+	}
+}
+
+/*
  * Search for at least 1 mount point in the dentry's subdirs.
  * We descend to the next level whenever the d_subdirs
  * list is non-empty and continue searching.
@@ -1003,7 +1135,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 {
 	struct dentry *new = NULL;
 
-	if (inode) {
+	if (inode && S_ISDIR(inode->i_mode)) {
 		spin_lock(&dcache_lock);
 		new = __d_find_alias(inode, 1);
 		if (new) {
@@ -1337,23 +1469,21 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
  * deleted it.
  */
  
-/**
- * d_move - move a dentry
+/*
+ * d_move_locked - move a dentry
  * @dentry: entry to move
  * @target: new dentry
  *
  * Update the dcache to reflect the move of a file name. Negative
  * dcache entries should not be moved in this way.
  */
-
-void d_move(struct dentry * dentry, struct dentry * target)
+static void d_move_locked(struct dentry * dentry, struct dentry * target)
 {
 	struct hlist_head *list;
 
 	if (!dentry->d_inode)
 		printk(KERN_WARNING "VFS: moving negative dcache entry\n");
 
-	spin_lock(&dcache_lock);
 	write_seqlock(&rename_lock);
 	/*
 	 * XXXX: do we really need to take target->d_lock?
@@ -1404,7 +1534,81 @@ already_unhashed:
 	fsnotify_d_move(dentry);
 	spin_unlock(&dentry->d_lock);
 	write_sequnlock(&rename_lock);
+}
+
+/**
+ * d_move - move a dentry
+ * @dentry: entry to move
+ * @target: new dentry
+ *
+ * Update the dcache to reflect the move of a file name. Negative
+ * dcache entries should not be moved in this way.
+ */
+
+void d_move(struct dentry * dentry, struct dentry * target)
+{
+	spin_lock(&dcache_lock);
+	d_move_locked(dentry, target);
+	spin_unlock(&dcache_lock);
+}
+
+/*
+ * Helper that returns 1 if p1 is a parent of p2, else 0
+ */
+static int d_isparent(struct dentry *p1, struct dentry *p2)
+{
+	struct dentry *p;
+
+	for (p = p2; p->d_parent != p; p = p->d_parent) {
+		if (p->d_parent == p1)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * This helper attempts to cope with remotely renamed directories
+ *
+ * It assumes that the caller is already holding
+ * dentry->d_parent->d_inode->i_mutex and the dcache_lock
+ *
+ * Note: If ever the locking in lock_rename() changes, then please
+ * remember to update this too...
+ *
+ * On return, dcache_lock will have been unlocked.
+ */
+static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
+{
+	struct mutex *m1 = NULL, *m2 = NULL;
+	struct dentry *ret;
+
+	/* If alias and dentry share a parent, then no extra locks required */
+	if (alias->d_parent == dentry->d_parent)
+		goto out_unalias;
+
+	/* Check for loops */
+	ret = ERR_PTR(-ELOOP);
+	if (d_isparent(alias, dentry))
+		goto out_err;
+
+	/* See lock_rename() */
+	ret = ERR_PTR(-EBUSY);
+	if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
+		goto out_err;
+	m1 = &dentry->d_sb->s_vfs_rename_mutex;
+	if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
+		goto out_err;
+	m2 = &alias->d_parent->d_inode->i_mutex;
+out_unalias:
+	d_move_locked(alias, dentry);
+	ret = alias;
+out_err:
 	spin_unlock(&dcache_lock);
+	if (m2)
+		mutex_unlock(m2);
+	if (m1)
+		mutex_unlock(m1);
+	return ret;
 }
 
 /*
@@ -1449,7 +1653,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
  */
 struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 {
-	struct dentry *alias, *actual;
+	struct dentry *actual;
 
 	BUG_ON(!d_unhashed(dentry));
 
@@ -1461,26 +1665,27 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 		goto found_lock;
 	}
 
-	/* See if a disconnected directory already exists as an anonymous root
-	 * that we should splice into the tree instead */
-	if (S_ISDIR(inode->i_mode) && (alias = __d_find_alias(inode, 1))) {
-		spin_lock(&alias->d_lock);
-
-		/* Is this a mountpoint that we could splice into our tree? */
-		if (IS_ROOT(alias))
-			goto connect_mountpoint;
-
-		if (alias->d_name.len == dentry->d_name.len &&
-		    alias->d_parent == dentry->d_parent &&
-		    memcmp(alias->d_name.name,
-			   dentry->d_name.name,
-			   dentry->d_name.len) == 0)
-			goto replace_with_alias;
-
-		spin_unlock(&alias->d_lock);
-
-		/* Doh! Seem to be aliasing directories for some reason... */
-		dput(alias);
+	if (S_ISDIR(inode->i_mode)) {
+		struct dentry *alias;
+
+		/* Does an aliased dentry already exist? */
+		alias = __d_find_alias(inode, 0);
+		if (alias) {
+			actual = alias;
+			/* Is this an anonymous mountpoint that we could splice
+			 * into our tree? */
+			if (IS_ROOT(alias)) {
+				spin_lock(&alias->d_lock);
+				__d_materialise_dentry(dentry, alias);
+				__d_drop(alias);
+				goto found;
+			}
+			/* Nope, but we must(!) avoid directory aliasing */
+			actual = __d_unalias(dentry, alias);
+			if (IS_ERR(actual))
+				dput(alias);
+			goto out_nolock;
+		}
 	}
 
 	/* Add a unique reference */
@@ -1496,7 +1701,7 @@ found:
 	_d_rehash(actual);
 	spin_unlock(&actual->d_lock);
 	spin_unlock(&dcache_lock);
-
+out_nolock:
 	if (actual == dentry) {
 		security_d_instantiate(dentry, inode);
 		return NULL;
@@ -1505,16 +1710,6 @@ found:
 	iput(inode);
 	return actual;
 
-	/* Convert the anonymous/root alias into an ordinary dentry */
-connect_mountpoint:
-	__d_materialise_dentry(dentry, alias);
-
-	/* Replace the candidate dentry with the alias in the tree */
-replace_with_alias:
-	__d_drop(alias);
-	actual = alias;
-	goto found;
-
 shouldnt_be_hashed:
 	spin_unlock(&dcache_lock);
 	BUG();
@@ -1877,9 +2072,6 @@ kmem_cache_t *filp_cachep __read_mostly;
 
 EXPORT_SYMBOL(d_genocide);
 
-extern void bdev_cache_init(void);
-extern void chrdev_init(void);
-
 void __init vfs_caches_init_early(void)
 {
 	dcache_init_early();
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index e4b430552c88..bf3901ab1744 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -32,8 +32,8 @@ static ssize_t default_write_file(struct file *file, const char __user *buf,
 
 static int default_open(struct inode *inode, struct file *file)
 {
-	if (inode->u.generic_ip)
-		file->private_data = inode->u.generic_ip;
+	if (inode->i_private)
+		file->private_data = inode->i_private;
 
 	return 0;
 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3ca268d2e5a2..e77676df6713 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -40,7 +40,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 		inode->i_mode = mode;
 		inode->i_uid = 0;
 		inode->i_gid = 0;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
@@ -55,7 +54,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		}
 	}
@@ -88,7 +87,7 @@ static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
 	res = debugfs_mknod(dir, dentry, mode, 0);
 	if (!res)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return res;
 }
 
@@ -168,7 +167,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
  *          directory dentry if set.  If this paramater is NULL, then the
  *          file will be created in the root of the debugfs filesystem.
  * @data: a pointer to something that the caller will want to get to later
- *        on.  The inode.u.generic_ip pointer will point to this value on
+ *        on.  The inode.i_private pointer will point to this value on
  *        the open() call.
  * @fops: a pointer to a struct file_operations that should be used for
  *        this file.
@@ -209,7 +208,7 @@ struct dentry *debugfs_create_file(const char *name, mode_t mode,
 
 	if (dentry->d_inode) {
 		if (data)
-			dentry->d_inode->u.generic_ip = data;
+			dentry->d_inode->i_private = data;
 		if (fops)
 			dentry->d_inode->i_fop = fops;
 	}
@@ -253,7 +252,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
  *
  * This function removes a file or directory in debugfs that was previously
  * created with a call to another debugfs function (like
- * debufs_create_file() or variants thereof.)
+ * debugfs_create_file() or variants thereof.)
  *
  * This function is required to be called in order for the file to be
  * removed, no automatic cleanup of files will happen when a module is
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f7aef5bb584a..5f7b5a6025bf 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -113,7 +113,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	inode->i_ino = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_blocks = 0;
-	inode->i_blksize = 1024;
 	inode->i_uid = inode->i_gid = 0;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 	inode->i_op = &simple_dir_inode_operations;
@@ -172,12 +171,11 @@ int devpts_pty_new(struct tty_struct *tty)
 		return -ENOMEM;
 
 	inode->i_ino = number+2;
-	inode->i_blksize = 1024;
 	inode->i_uid = config.setuid ? config.uid : current->fsuid;
 	inode->i_gid = config.setgid ? config.gid : current->fsgid;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	init_special_inode(inode, S_IFCHR|config.mode, device);
-	inode->u.generic_ip = tty;
+	inode->i_private = tty;
 
 	dentry = get_node(number);
 	if (!IS_ERR(dentry) && !dentry->d_inode)
@@ -196,7 +194,7 @@ struct tty_struct *devpts_get_tty(int number)
 	tty = NULL;
 	if (!IS_ERR(dentry)) {
 		if (dentry->d_inode)
-			tty = dentry->d_inode->u.generic_ip;
+			tty = dentry->d_inode->i_private;
 		dput(dentry);
 	}
 
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
new file mode 100644
index 000000000000..81b2c6465eeb
--- /dev/null
+++ b/fs/dlm/Kconfig
@@ -0,0 +1,20 @@
+menu "Distributed Lock Manager"
+	depends on INET && IP_SCTP && EXPERIMENTAL
+
+config DLM
+	tristate "Distributed Lock Manager (DLM)"
+	depends on IPV6 || IPV6=n
+	select CONFIGFS_FS
+	help
+	A general purpose distributed lock manager for kernel or userspace
+	applications.
+
+config DLM_DEBUG
+	bool "DLM debugging"
+	depends on DLM
+	help
+	Under the debugfs mount point, the name of each lockspace will
+	appear as a file in the "dlm" directory.  The output is the
+	list of resource and locks the local node knows about.
+
+endmenu
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
new file mode 100644
index 000000000000..1832e0297f7d
--- /dev/null
+++ b/fs/dlm/Makefile
@@ -0,0 +1,19 @@
+obj-$(CONFIG_DLM) +=		dlm.o
+dlm-y :=			ast.o \
+				config.o \
+				dir.o \
+				lock.o \
+				lockspace.o \
+				lowcomms.o \
+				main.o \
+				member.o \
+				memory.o \
+				midcomms.o \
+				rcom.o \
+				recover.o \
+				recoverd.o \
+				requestqueue.o \
+				user.o \
+				util.o
+dlm-$(CONFIG_DLM_DEBUG) +=	debug_fs.o
+
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
new file mode 100644
index 000000000000..f91d39cb1e0b
--- /dev/null
+++ b/fs/dlm/ast.c
@@ -0,0 +1,173 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lock.h"
+#include "user.h"
+
+#define WAKE_ASTS  0
+
+static struct list_head		ast_queue;
+static spinlock_t		ast_queue_lock;
+static struct task_struct *	astd_task;
+static unsigned long		astd_wakeflags;
+static struct mutex		astd_running;
+
+
+void dlm_del_ast(struct dlm_lkb *lkb)
+{
+	spin_lock(&ast_queue_lock);
+	if (lkb->lkb_ast_type & (AST_COMP | AST_BAST))
+		list_del(&lkb->lkb_astqueue);
+	spin_unlock(&ast_queue_lock);
+}
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type)
+{
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		dlm_user_add_ast(lkb, type);
+		return;
+	}
+	DLM_ASSERT(lkb->lkb_astaddr != DLM_FAKE_USER_AST, dlm_print_lkb(lkb););
+
+	spin_lock(&ast_queue_lock);
+	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+		kref_get(&lkb->lkb_ref);
+		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
+	}
+	lkb->lkb_ast_type |= type;
+	spin_unlock(&ast_queue_lock);
+
+	set_bit(WAKE_ASTS, &astd_wakeflags);
+	wake_up_process(astd_task);
+}
+
+static void process_asts(void)
+{
+	struct dlm_ls *ls = NULL;
+	struct dlm_rsb *r = NULL;
+	struct dlm_lkb *lkb;
+	void (*cast) (long param);
+	void (*bast) (long param, int mode);
+	int type = 0, found, bmode;
+
+	for (;;) {
+		found = 0;
+		spin_lock(&ast_queue_lock);
+		list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+			r = lkb->lkb_resource;
+			ls = r->res_ls;
+
+			if (dlm_locking_stopped(ls))
+				continue;
+
+			list_del(&lkb->lkb_astqueue);
+			type = lkb->lkb_ast_type;
+			lkb->lkb_ast_type = 0;
+			found = 1;
+			break;
+		}
+		spin_unlock(&ast_queue_lock);
+
+		if (!found)
+			break;
+
+		cast = lkb->lkb_astaddr;
+		bast = lkb->lkb_bastaddr;
+		bmode = lkb->lkb_bastmode;
+
+		if ((type & AST_COMP) && cast)
+			cast(lkb->lkb_astparam);
+
+		/* FIXME: Is it safe to look at lkb_grmode here
+		   without doing a lock_rsb() ?
+		   Look at other checks in v1 to avoid basts. */
+
+		if ((type & AST_BAST) && bast)
+			if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
+				bast(lkb->lkb_astparam, bmode);
+
+		/* this removes the reference added by dlm_add_ast
+		   and may result in the lkb being freed */
+		dlm_put_lkb(lkb);
+
+		schedule();
+	}
+}
+
+static inline int no_asts(void)
+{
+	int ret;
+
+	spin_lock(&ast_queue_lock);
+	ret = list_empty(&ast_queue);
+	spin_unlock(&ast_queue_lock);
+	return ret;
+}
+
+static int dlm_astd(void *data)
+{
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!test_bit(WAKE_ASTS, &astd_wakeflags))
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		mutex_lock(&astd_running);
+		if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags))
+			process_asts();
+		mutex_unlock(&astd_running);
+	}
+	return 0;
+}
+
+void dlm_astd_wake(void)
+{
+	if (!no_asts()) {
+		set_bit(WAKE_ASTS, &astd_wakeflags);
+		wake_up_process(astd_task);
+	}
+}
+
+int dlm_astd_start(void)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	INIT_LIST_HEAD(&ast_queue);
+	spin_lock_init(&ast_queue_lock);
+	mutex_init(&astd_running);
+
+	p = kthread_run(dlm_astd, NULL, "dlm_astd");
+	if (IS_ERR(p))
+		error = PTR_ERR(p);
+	else
+		astd_task = p;
+	return error;
+}
+
+void dlm_astd_stop(void)
+{
+	kthread_stop(astd_task);
+}
+
+void dlm_astd_suspend(void)
+{
+	mutex_lock(&astd_running);
+}
+
+void dlm_astd_resume(void)
+{
+	mutex_unlock(&astd_running);
+}
+
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
new file mode 100644
index 000000000000..6ee276c74c52
--- /dev/null
+++ b/fs/dlm/ast.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __ASTD_DOT_H__
+#define __ASTD_DOT_H__
+
+void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_del_ast(struct dlm_lkb *lkb);
+
+void dlm_astd_wake(void);
+int dlm_astd_start(void);
+void dlm_astd_stop(void);
+void dlm_astd_suspend(void);
+void dlm_astd_resume(void);
+
+#endif
+
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
new file mode 100644
index 000000000000..88553054bbfa
--- /dev/null
+++ b/fs/dlm/config.c
@@ -0,0 +1,789 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/configfs.h>
+#include <net/sock.h>
+
+#include "config.h"
+#include "lowcomms.h"
+
+/*
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid
+ * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight
+ * /config/dlm/<cluster>/comms/<comm>/nodeid
+ * /config/dlm/<cluster>/comms/<comm>/local
+ * /config/dlm/<cluster>/comms/<comm>/addr
+ * The <cluster> level is useless, but I haven't figured out how to avoid it.
+ */
+
+static struct config_group *space_list;
+static struct config_group *comm_list;
+static struct comm *local_comm;
+
+struct clusters;
+struct cluster;
+struct spaces;
+struct space;
+struct comms;
+struct comm;
+struct nodes;
+struct node;
+
+static struct config_group *make_cluster(struct config_group *, const char *);
+static void drop_cluster(struct config_group *, struct config_item *);
+static void release_cluster(struct config_item *);
+static struct config_group *make_space(struct config_group *, const char *);
+static void drop_space(struct config_group *, struct config_item *);
+static void release_space(struct config_item *);
+static struct config_item *make_comm(struct config_group *, const char *);
+static void drop_comm(struct config_group *, struct config_item *);
+static void release_comm(struct config_item *);
+static struct config_item *make_node(struct config_group *, const char *);
+static void drop_node(struct config_group *, struct config_item *);
+static void release_node(struct config_item *);
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+			 char *buf);
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len);
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+			 char *buf);
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len);
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_local_read(struct comm *cm, char *buf);
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
+static ssize_t node_nodeid_read(struct node *nd, char *buf);
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
+static ssize_t node_weight_read(struct node *nd, char *buf);
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
+
+enum {
+	COMM_ATTR_NODEID = 0,
+	COMM_ATTR_LOCAL,
+	COMM_ATTR_ADDR,
+};
+
+struct comm_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct comm *, char *);
+	ssize_t (*store)(struct comm *, const char *, size_t);
+};
+
+static struct comm_attribute comm_attr_nodeid = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "nodeid",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = comm_nodeid_read,
+	.store  = comm_nodeid_write,
+};
+
+static struct comm_attribute comm_attr_local = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "local",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = comm_local_read,
+	.store  = comm_local_write,
+};
+
+static struct comm_attribute comm_attr_addr = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "addr",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.store  = comm_addr_write,
+};
+
+static struct configfs_attribute *comm_attrs[] = {
+	[COMM_ATTR_NODEID] = &comm_attr_nodeid.attr,
+	[COMM_ATTR_LOCAL] = &comm_attr_local.attr,
+	[COMM_ATTR_ADDR] = &comm_attr_addr.attr,
+	NULL,
+};
+
+enum {
+	NODE_ATTR_NODEID = 0,
+	NODE_ATTR_WEIGHT,
+};
+
+struct node_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct node *, char *);
+	ssize_t (*store)(struct node *, const char *, size_t);
+};
+
+static struct node_attribute node_attr_nodeid = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "nodeid",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = node_nodeid_read,
+	.store  = node_nodeid_write,
+};
+
+static struct node_attribute node_attr_weight = {
+	.attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "weight",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+	.show   = node_weight_read,
+	.store  = node_weight_write,
+};
+
+static struct configfs_attribute *node_attrs[] = {
+	[NODE_ATTR_NODEID] = &node_attr_nodeid.attr,
+	[NODE_ATTR_WEIGHT] = &node_attr_weight.attr,
+	NULL,
+};
+
+struct clusters {
+	struct configfs_subsystem subsys;
+};
+
+struct cluster {
+	struct config_group group;
+};
+
+struct spaces {
+	struct config_group ss_group;
+};
+
+struct space {
+	struct config_group group;
+	struct list_head members;
+	struct mutex members_lock;
+	int members_count;
+};
+
+struct comms {
+	struct config_group cs_group;
+};
+
+struct comm {
+	struct config_item item;
+	int nodeid;
+	int local;
+	int addr_count;
+	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
+};
+
+struct nodes {
+	struct config_group ns_group;
+};
+
+struct node {
+	struct config_item item;
+	struct list_head list; /* space->members */
+	int nodeid;
+	int weight;
+};
+
+static struct configfs_group_operations clusters_ops = {
+	.make_group = make_cluster,
+	.drop_item = drop_cluster,
+};
+
+static struct configfs_item_operations cluster_ops = {
+	.release = release_cluster,
+};
+
+static struct configfs_group_operations spaces_ops = {
+	.make_group = make_space,
+	.drop_item = drop_space,
+};
+
+static struct configfs_item_operations space_ops = {
+	.release = release_space,
+};
+
+static struct configfs_group_operations comms_ops = {
+	.make_item = make_comm,
+	.drop_item = drop_comm,
+};
+
+static struct configfs_item_operations comm_ops = {
+	.release = release_comm,
+	.show_attribute = show_comm,
+	.store_attribute = store_comm,
+};
+
+static struct configfs_group_operations nodes_ops = {
+	.make_item = make_node,
+	.drop_item = drop_node,
+};
+
+static struct configfs_item_operations node_ops = {
+	.release = release_node,
+	.show_attribute = show_node,
+	.store_attribute = store_node,
+};
+
+static struct config_item_type clusters_type = {
+	.ct_group_ops = &clusters_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type cluster_type = {
+	.ct_item_ops = &cluster_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type spaces_type = {
+	.ct_group_ops = &spaces_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type space_type = {
+	.ct_item_ops = &space_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comms_type = {
+	.ct_group_ops = &comms_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type comm_type = {
+	.ct_item_ops = &comm_ops,
+	.ct_attrs = comm_attrs,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type nodes_type = {
+	.ct_group_ops = &nodes_ops,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct config_item_type node_type = {
+	.ct_item_ops = &node_ops,
+	.ct_attrs = node_attrs,
+	.ct_owner = THIS_MODULE,
+};
+
+static struct cluster *to_cluster(struct config_item *i)
+{
+	return i ? container_of(to_config_group(i), struct cluster, group):NULL;
+}
+
+static struct space *to_space(struct config_item *i)
+{
+	return i ? container_of(to_config_group(i), struct space, group) : NULL;
+}
+
+static struct comm *to_comm(struct config_item *i)
+{
+	return i ? container_of(i, struct comm, item) : NULL;
+}
+
+static struct node *to_node(struct config_item *i)
+{
+	return i ? container_of(i, struct node, item) : NULL;
+}
+
+static struct config_group *make_cluster(struct config_group *g,
+					 const char *name)
+{
+	struct cluster *cl = NULL;
+	struct spaces *sps = NULL;
+	struct comms *cms = NULL;
+	void *gps = NULL;
+
+	cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+	gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+	sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
+	cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+
+	if (!cl || !gps || !sps || !cms)
+		goto fail;
+
+	config_group_init_type_name(&cl->group, name, &cluster_type);
+	config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type);
+	config_group_init_type_name(&cms->cs_group, "comms", &comms_type);
+
+	cl->group.default_groups = gps;
+	cl->group.default_groups[0] = &sps->ss_group;
+	cl->group.default_groups[1] = &cms->cs_group;
+	cl->group.default_groups[2] = NULL;
+
+	space_list = &sps->ss_group;
+	comm_list = &cms->cs_group;
+	return &cl->group;
+
+ fail:
+	kfree(cl);
+	kfree(gps);
+	kfree(sps);
+	kfree(cms);
+	return NULL;
+}
+
+static void drop_cluster(struct config_group *g, struct config_item *i)
+{
+	struct cluster *cl = to_cluster(i);
+	struct config_item *tmp;
+	int j;
+
+	for (j = 0; cl->group.default_groups[j]; j++) {
+		tmp = &cl->group.default_groups[j]->cg_item;
+		cl->group.default_groups[j] = NULL;
+		config_item_put(tmp);
+	}
+
+	space_list = NULL;
+	comm_list = NULL;
+
+	config_item_put(i);
+}
+
+static void release_cluster(struct config_item *i)
+{
+	struct cluster *cl = to_cluster(i);
+	kfree(cl->group.default_groups);
+	kfree(cl);
+}
+
+static struct config_group *make_space(struct config_group *g, const char *name)
+{
+	struct space *sp = NULL;
+	struct nodes *nds = NULL;
+	void *gps = NULL;
+
+	sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+	gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
+	nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+
+	if (!sp || !gps || !nds)
+		goto fail;
+
+	config_group_init_type_name(&sp->group, name, &space_type);
+	config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type);
+
+	sp->group.default_groups = gps;
+	sp->group.default_groups[0] = &nds->ns_group;
+	sp->group.default_groups[1] = NULL;
+
+	INIT_LIST_HEAD(&sp->members);
+	mutex_init(&sp->members_lock);
+	sp->members_count = 0;
+	return &sp->group;
+
+ fail:
+	kfree(sp);
+	kfree(gps);
+	kfree(nds);
+	return NULL;
+}
+
+static void drop_space(struct config_group *g, struct config_item *i)
+{
+	struct space *sp = to_space(i);
+	struct config_item *tmp;
+	int j;
+
+	/* assert list_empty(&sp->members) */
+
+	for (j = 0; sp->group.default_groups[j]; j++) {
+		tmp = &sp->group.default_groups[j]->cg_item;
+		sp->group.default_groups[j] = NULL;
+		config_item_put(tmp);
+	}
+
+	config_item_put(i);
+}
+
+static void release_space(struct config_item *i)
+{
+	struct space *sp = to_space(i);
+	kfree(sp->group.default_groups);
+	kfree(sp);
+}
+
+static struct config_item *make_comm(struct config_group *g, const char *name)
+{
+	struct comm *cm;
+
+	cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+	if (!cm)
+		return NULL;
+
+	config_item_init_type_name(&cm->item, name, &comm_type);
+	cm->nodeid = -1;
+	cm->local = 0;
+	cm->addr_count = 0;
+	return &cm->item;
+}
+
+static void drop_comm(struct config_group *g, struct config_item *i)
+{
+	struct comm *cm = to_comm(i);
+	if (local_comm == cm)
+		local_comm = NULL;
+	dlm_lowcomms_close(cm->nodeid);
+	while (cm->addr_count--)
+		kfree(cm->addr[cm->addr_count]);
+	config_item_put(i);
+}
+
+static void release_comm(struct config_item *i)
+{
+	struct comm *cm = to_comm(i);
+	kfree(cm);
+}
+
+static struct config_item *make_node(struct config_group *g, const char *name)
+{
+	struct space *sp = to_space(g->cg_item.ci_parent);
+	struct node *nd;
+
+	nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+	if (!nd)
+		return NULL;
+
+	config_item_init_type_name(&nd->item, name, &node_type);
+	nd->nodeid = -1;
+	nd->weight = 1;  /* default weight of 1 if none is set */
+
+	mutex_lock(&sp->members_lock);
+	list_add(&nd->list, &sp->members);
+	sp->members_count++;
+	mutex_unlock(&sp->members_lock);
+
+	return &nd->item;
+}
+
+static void drop_node(struct config_group *g, struct config_item *i)
+{
+	struct space *sp = to_space(g->cg_item.ci_parent);
+	struct node *nd = to_node(i);
+
+	mutex_lock(&sp->members_lock);
+	list_del(&nd->list);
+	sp->members_count--;
+	mutex_unlock(&sp->members_lock);
+
+	config_item_put(i);
+}
+
+static void release_node(struct config_item *i)
+{
+	struct node *nd = to_node(i);
+	kfree(nd);
+}
+
+static struct clusters clusters_root = {
+	.subsys = {
+		.su_group = {
+			.cg_item = {
+				.ci_namebuf = "dlm",
+				.ci_type = &clusters_type,
+			},
+		},
+	},
+};
+
+int dlm_config_init(void)
+{
+	config_group_init(&clusters_root.subsys.su_group);
+	init_MUTEX(&clusters_root.subsys.su_sem);
+	return configfs_register_subsystem(&clusters_root.subsys);
+}
+
+void dlm_config_exit(void)
+{
+	configfs_unregister_subsystem(&clusters_root.subsys);
+}
+
+/*
+ * Functions for user space to read/write attributes
+ */
+
+static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct comm *cm = to_comm(i);
+	struct comm_attribute *cma =
+			container_of(a, struct comm_attribute, attr);
+	return cma->show ? cma->show(cm, buf) : 0;
+}
+
+static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct comm *cm = to_comm(i);
+	struct comm_attribute *cma =
+		container_of(a, struct comm_attribute, attr);
+	return cma->store ? cma->store(cm, buf, len) : -EINVAL;
+}
+
+static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+{
+	return sprintf(buf, "%d\n", cm->nodeid);
+}
+
+static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+{
+	cm->nodeid = simple_strtol(buf, NULL, 0);
+	return len;
+}
+
+static ssize_t comm_local_read(struct comm *cm, char *buf)
+{
+	return sprintf(buf, "%d\n", cm->local);
+}
+
+static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+{
+	cm->local= simple_strtol(buf, NULL, 0);
+	if (cm->local && !local_comm)
+		local_comm = cm;
+	return len;
+}
+
+static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+{
+	struct sockaddr_storage *addr;
+
+	if (len != sizeof(struct sockaddr_storage))
+		return -EINVAL;
+
+	if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+		return -ENOSPC;
+
+	addr = kzalloc(sizeof(*addr), GFP_KERNEL);
+	if (!addr)
+		return -ENOMEM;
+
+	memcpy(addr, buf, len);
+	cm->addr[cm->addr_count++] = addr;
+	return len;
+}
+
+static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
+			 char *buf)
+{
+	struct node *nd = to_node(i);
+	struct node_attribute *nda =
+			container_of(a, struct node_attribute, attr);
+	return nda->show ? nda->show(nd, buf) : 0;
+}
+
+static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
+			  const char *buf, size_t len)
+{
+	struct node *nd = to_node(i);
+	struct node_attribute *nda =
+		container_of(a, struct node_attribute, attr);
+	return nda->store ? nda->store(nd, buf, len) : -EINVAL;
+}
+
+static ssize_t node_nodeid_read(struct node *nd, char *buf)
+{
+	return sprintf(buf, "%d\n", nd->nodeid);
+}
+
+static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+{
+	nd->nodeid = simple_strtol(buf, NULL, 0);
+	return len;
+}
+
+static ssize_t node_weight_read(struct node *nd, char *buf)
+{
+	return sprintf(buf, "%d\n", nd->weight);
+}
+
+static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+{
+	nd->weight = simple_strtol(buf, NULL, 0);
+	return len;
+}
+
+/*
+ * Functions for the dlm to get the info that's been configured
+ */
+
+static struct space *get_space(char *name)
+{
+	if (!space_list)
+		return NULL;
+	return to_space(config_group_find_obj(space_list, name));
+}
+
+static void put_space(struct space *sp)
+{
+	config_item_put(&sp->group.cg_item);
+}
+
+static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+{
+	struct config_item *i;
+	struct comm *cm = NULL;
+	int found = 0;
+
+	if (!comm_list)
+		return NULL;
+
+	down(&clusters_root.subsys.su_sem);
+
+	list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
+		cm = to_comm(i);
+
+		if (nodeid) {
+			if (cm->nodeid != nodeid)
+				continue;
+			found = 1;
+			break;
+		} else {
+			if (!cm->addr_count ||
+			    memcmp(cm->addr[0], addr, sizeof(*addr)))
+				continue;
+			found = 1;
+			break;
+		}
+	}
+	up(&clusters_root.subsys.su_sem);
+
+	if (found)
+		config_item_get(i);
+	else
+		cm = NULL;
+	return cm;
+}
+
+static void put_comm(struct comm *cm)
+{
+	config_item_put(&cm->item);
+}
+
+/* caller must free mem */
+int dlm_nodeid_list(char *lsname, int **ids_out)
+{
+	struct space *sp;
+	struct node *nd;
+	int i = 0, rv = 0;
+	int *ids;
+
+	sp = get_space(lsname);
+	if (!sp)
+		return -EEXIST;
+
+	mutex_lock(&sp->members_lock);
+	if (!sp->members_count) {
+		rv = 0;
+		goto out;
+	}
+
+	ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL);
+	if (!ids) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	rv = sp->members_count;
+	list_for_each_entry(nd, &sp->members, list)
+		ids[i++] = nd->nodeid;
+
+	if (rv != i)
+		printk("bad nodeid count %d %d\n", rv, i);
+
+	*ids_out = ids;
+ out:
+	mutex_unlock(&sp->members_lock);
+	put_space(sp);
+	return rv;
+}
+
+int dlm_node_weight(char *lsname, int nodeid)
+{
+	struct space *sp;
+	struct node *nd;
+	int w = -EEXIST;
+
+	sp = get_space(lsname);
+	if (!sp)
+		goto out;
+
+	mutex_lock(&sp->members_lock);
+	list_for_each_entry(nd, &sp->members, list) {
+		if (nd->nodeid != nodeid)
+			continue;
+		w = nd->weight;
+		break;
+	}
+	mutex_unlock(&sp->members_lock);
+	put_space(sp);
+ out:
+	return w;
+}
+
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
+{
+	struct comm *cm = get_comm(nodeid, NULL);
+	if (!cm)
+		return -EEXIST;
+	if (!cm->addr_count)
+		return -ENOENT;
+	memcpy(addr, cm->addr[0], sizeof(*addr));
+	put_comm(cm);
+	return 0;
+}
+
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
+{
+	struct comm *cm = get_comm(0, addr);
+	if (!cm)
+		return -EEXIST;
+	*nodeid = cm->nodeid;
+	put_comm(cm);
+	return 0;
+}
+
+int dlm_our_nodeid(void)
+{
+	return local_comm ? local_comm->nodeid : 0;
+}
+
+/* num 0 is first addr, num 1 is second addr */
+int dlm_our_addr(struct sockaddr_storage *addr, int num)
+{
+	if (!local_comm)
+		return -1;
+	if (num + 1 > local_comm->addr_count)
+		return -1;
+	memcpy(addr, local_comm->addr[num], sizeof(*addr));
+	return 0;
+}
+
+/* Config file defaults */
+#define DEFAULT_TCP_PORT       21064
+#define DEFAULT_BUFFER_SIZE     4096
+#define DEFAULT_RSBTBL_SIZE      256
+#define DEFAULT_LKBTBL_SIZE     1024
+#define DEFAULT_DIRTBL_SIZE      512
+#define DEFAULT_RECOVER_TIMER      5
+#define DEFAULT_TOSS_SECS         10
+#define DEFAULT_SCAN_SECS          5
+
+struct dlm_config_info dlm_config = {
+	.tcp_port = DEFAULT_TCP_PORT,
+	.buffer_size = DEFAULT_BUFFER_SIZE,
+	.rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+	.lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+	.dirtbl_size = DEFAULT_DIRTBL_SIZE,
+	.recover_timer = DEFAULT_RECOVER_TIMER,
+	.toss_secs = DEFAULT_TOSS_SECS,
+	.scan_secs = DEFAULT_SCAN_SECS
+};
+
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
new file mode 100644
index 000000000000..9da7839958a9
--- /dev/null
+++ b/fs/dlm/config.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __CONFIG_DOT_H__
+#define __CONFIG_DOT_H__
+
+#define DLM_MAX_ADDR_COUNT 3
+
+struct dlm_config_info {
+	int tcp_port;
+	int buffer_size;
+	int rsbtbl_size;
+	int lkbtbl_size;
+	int dirtbl_size;
+	int recover_timer;
+	int toss_secs;
+	int scan_secs;
+};
+
+extern struct dlm_config_info dlm_config;
+
+int dlm_config_init(void);
+void dlm_config_exit(void);
+int dlm_node_weight(char *lsname, int nodeid);
+int dlm_nodeid_list(char *lsname, int **ids_out);
+int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
+int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
+int dlm_our_nodeid(void);
+int dlm_our_addr(struct sockaddr_storage *addr, int num);
+
+#endif				/* __CONFIG_DOT_H__ */
+
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
new file mode 100644
index 000000000000..ca94a837a5bb
--- /dev/null
+++ b/fs/dlm/debug_fs.c
@@ -0,0 +1,387 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+
+#include "dlm_internal.h"
+
+#define DLM_DEBUG_BUF_LEN 4096
+static char debug_buf[DLM_DEBUG_BUF_LEN];
+static struct mutex debug_buf_lock;
+
+static struct dentry *dlm_root;
+
+struct rsb_iter {
+	int entry;
+	struct dlm_ls *ls;
+	struct list_head *next;
+	struct dlm_rsb *rsb;
+};
+
+/*
+ * dump all rsb's in the lockspace hash table
+ */
+
+static char *print_lockmode(int mode)
+{
+	switch (mode) {
+	case DLM_LOCK_IV:
+		return "--";
+	case DLM_LOCK_NL:
+		return "NL";
+	case DLM_LOCK_CR:
+		return "CR";
+	case DLM_LOCK_CW:
+		return "CW";
+	case DLM_LOCK_PR:
+		return "PR";
+	case DLM_LOCK_PW:
+		return "PW";
+	case DLM_LOCK_EX:
+		return "EX";
+	default:
+		return "??";
+	}
+}
+
+static void print_lock(struct seq_file *s, struct dlm_lkb *lkb,
+		       struct dlm_rsb *res)
+{
+	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
+
+	if (lkb->lkb_status == DLM_LKSTS_CONVERT
+	    || lkb->lkb_status == DLM_LKSTS_WAITING)
+		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
+
+	if (lkb->lkb_nodeid) {
+		if (lkb->lkb_nodeid != res->res_nodeid)
+			seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid,
+				   lkb->lkb_remid);
+		else
+			seq_printf(s, " Master:     %08x", lkb->lkb_remid);
+	}
+
+	if (lkb->lkb_wait_type)
+		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+
+	seq_printf(s, "\n");
+}
+
+static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+{
+	struct dlm_lkb *lkb;
+	int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+
+	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+	for (i = 0; i < res->res_length; i++) {
+		if (isprint(res->res_name[i]))
+			seq_printf(s, "%c", res->res_name[i]);
+		else
+			seq_printf(s, "%c", '.');
+	}
+	if (res->res_nodeid > 0)
+		seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
+			   res->res_nodeid);
+	else if (res->res_nodeid == 0)
+		seq_printf(s, "\"  \nMaster Copy\n");
+	else if (res->res_nodeid == -1)
+		seq_printf(s, "\"  \nLooking up master (lkid %x)\n",
+			   res->res_first_lkid);
+	else
+		seq_printf(s, "\"  \nInvalid master %d\n", res->res_nodeid);
+
+	/* Print the LVB: */
+	if (res->res_lvbptr) {
+		seq_printf(s, "LVB: ");
+		for (i = 0; i < lvblen; i++) {
+			if (i == lvblen / 2)
+				seq_printf(s, "\n     ");
+			seq_printf(s, "%02x ",
+				   (unsigned char) res->res_lvbptr[i]);
+		}
+		if (rsb_flag(res, RSB_VALNOTVALID))
+			seq_printf(s, " (INVALID)");
+		seq_printf(s, "\n");
+	}
+
+	root_list = !list_empty(&res->res_root_list);
+	recover_list = !list_empty(&res->res_recover_list);
+
+	if (root_list || recover_list) {
+		seq_printf(s, "Recovery: root %d recover %d flags %lx "
+			   "count %d\n", root_list, recover_list,
+			   res->res_flags, res->res_recover_locks_count);
+	}
+
+	/* Print the locks attached to this resource */
+	seq_printf(s, "Granted Queue\n");
+	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	seq_printf(s, "Conversion Queue\n");
+	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	seq_printf(s, "Waiting Queue\n");
+	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
+		print_lock(s, lkb, res);
+
+	if (list_empty(&res->res_lookup))
+		goto out;
+
+	seq_printf(s, "Lookup Queue\n");
+	list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
+		seq_printf(s, "%08x %s", lkb->lkb_id,
+			   print_lockmode(lkb->lkb_rqmode));
+		if (lkb->lkb_wait_type)
+			seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
+		seq_printf(s, "\n");
+	}
+ out:
+	return 0;
+}
+
+static int rsb_iter_next(struct rsb_iter *ri)
+{
+	struct dlm_ls *ls = ri->ls;
+	int i;
+
+	if (!ri->next) {
+ top:
+		/* Find the next non-empty hash bucket */
+		for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
+			read_lock(&ls->ls_rsbtbl[i].lock);
+			if (!list_empty(&ls->ls_rsbtbl[i].list)) {
+				ri->next = ls->ls_rsbtbl[i].list.next;
+				read_unlock(&ls->ls_rsbtbl[i].lock);
+				break;
+			}
+			read_unlock(&ls->ls_rsbtbl[i].lock);
+                }
+		ri->entry = i;
+
+		if (ri->entry >= ls->ls_rsbtbl_size)
+			return 1;
+	} else {
+		i = ri->entry;
+		read_lock(&ls->ls_rsbtbl[i].lock);
+		ri->next = ri->next->next;
+		if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
+			/* End of list - move to next bucket */
+			ri->next = NULL;
+			ri->entry++;
+			read_unlock(&ls->ls_rsbtbl[i].lock);
+			goto top;
+                }
+		read_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+	ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
+
+	return 0;
+}
+
+static void rsb_iter_free(struct rsb_iter *ri)
+{
+	kfree(ri);
+}
+
+static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
+{
+	struct rsb_iter *ri;
+
+	ri = kmalloc(sizeof *ri, GFP_KERNEL);
+	if (!ri)
+		return NULL;
+
+	ri->ls = ls;
+	ri->entry = 0;
+	ri->next = NULL;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct rsb_iter *ri;
+	loff_t n = *pos;
+
+	ri = rsb_iter_init(file->private);
+	if (!ri)
+		return NULL;
+
+	while (n--) {
+		if (rsb_iter_next(ri)) {
+			rsb_iter_free(ri);
+			return NULL;
+		}
+	}
+
+	return ri;
+}
+
+static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
+{
+	struct rsb_iter *ri = iter_ptr;
+
+	(*pos)++;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+	/* nothing for now */
+}
+
+static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
+{
+	struct rsb_iter *ri = iter_ptr;
+
+	print_resource(ri->rsb, file);
+
+	return 0;
+}
+
+static struct seq_operations rsb_seq_ops = {
+	.start = rsb_seq_start,
+	.next  = rsb_seq_next,
+	.stop  = rsb_seq_stop,
+	.show  = rsb_seq_show,
+};
+
+static int rsb_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &rsb_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private;
+
+	return 0;
+}
+
+static struct file_operations rsb_fops = {
+	.owner   = THIS_MODULE,
+	.open    = rsb_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+/*
+ * dump lkb's on the ls_waiters list
+ */
+
+static int waiters_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+static ssize_t waiters_read(struct file *file, char __user *userbuf,
+			    size_t count, loff_t *ppos)
+{
+	struct dlm_ls *ls = file->private_data;
+	struct dlm_lkb *lkb;
+	size_t len = DLM_DEBUG_BUF_LEN, pos = 0, ret, rv;
+
+	mutex_lock(&debug_buf_lock);
+	mutex_lock(&ls->ls_waiters_mutex);
+	memset(debug_buf, 0, sizeof(debug_buf));
+
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		ret = snprintf(debug_buf + pos, len - pos, "%x %d %d %s\n",
+			       lkb->lkb_id, lkb->lkb_wait_type,
+			       lkb->lkb_nodeid, lkb->lkb_resource->res_name);
+		if (ret >= len - pos)
+			break;
+		pos += ret;
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	rv = simple_read_from_buffer(userbuf, count, ppos, debug_buf, pos);
+	mutex_unlock(&debug_buf_lock);
+	return rv;
+}
+
+static struct file_operations waiters_fops = {
+	.owner   = THIS_MODULE,
+	.open    = waiters_open,
+	.read    = waiters_read
+};
+
+int dlm_create_debug_file(struct dlm_ls *ls)
+{
+	char name[DLM_LOCKSPACE_LEN+8];
+
+	ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &rsb_fops);
+	if (!ls->ls_debug_rsb_dentry)
+		return -ENOMEM;
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+
+	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+							  S_IFREG | S_IRUGO,
+							  dlm_root,
+							  ls,
+							  &waiters_fops);
+	if (!ls->ls_debug_waiters_dentry) {
+		debugfs_remove(ls->ls_debug_rsb_dentry);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+	if (ls->ls_debug_rsb_dentry)
+		debugfs_remove(ls->ls_debug_rsb_dentry);
+	if (ls->ls_debug_waiters_dentry)
+		debugfs_remove(ls->ls_debug_waiters_dentry);
+}
+
+int dlm_register_debugfs(void)
+{
+	mutex_init(&debug_buf_lock);
+	dlm_root = debugfs_create_dir("dlm", NULL);
+	return dlm_root ? 0 : -ENOMEM;
+}
+
+void dlm_unregister_debugfs(void)
+{
+	debugfs_remove(dlm_root);
+}
+
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
new file mode 100644
index 000000000000..46754553fdcc
--- /dev/null
+++ b/fs/dlm/dir.c
@@ -0,0 +1,423 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "rcom.h"
+#include "config.h"
+#include "memory.h"
+#include "recover.h"
+#include "util.h"
+#include "lock.h"
+#include "dir.h"
+
+
+static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+	spin_lock(&ls->ls_recover_list_lock);
+	list_add(&de->list, &ls->ls_recover_list);
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
+{
+	int found = 0;
+	struct dlm_direntry *de;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_for_each_entry(de, &ls->ls_recover_list, list) {
+		if (de->length == len) {
+			list_del(&de->list);
+			de->master_nodeid = 0;
+			memset(de->name, 0, len);
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	if (!found)
+		de = allocate_direntry(ls, len);
+	return de;
+}
+
+void dlm_clear_free_entries(struct dlm_ls *ls)
+{
+	struct dlm_direntry *de;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	while (!list_empty(&ls->ls_recover_list)) {
+		de = list_entry(ls->ls_recover_list.next, struct dlm_direntry,
+				list);
+		list_del(&de->list);
+		free_direntry(de);
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+/*
+ * We use the upper 16 bits of the hash value to select the directory node.
+ * Low bits are used for distribution of rsb's among hash buckets on each node.
+ *
+ * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of
+ * num_nodes to the hash value.  This value in the desired range is used as an
+ * offset into the sorted list of nodeid's to give the particular nodeid.
+ */
+
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash)
+{
+	struct list_head *tmp;
+	struct dlm_member *memb = NULL;
+	uint32_t node, n = 0;
+	int nodeid;
+
+	if (ls->ls_num_nodes == 1) {
+		nodeid = dlm_our_nodeid();
+		goto out;
+	}
+
+	if (ls->ls_node_array) {
+		node = (hash >> 16) % ls->ls_total_weight;
+		nodeid = ls->ls_node_array[node];
+		goto out;
+	}
+
+	/* make_member_array() failed to kmalloc ls_node_array... */
+
+	node = (hash >> 16) % ls->ls_num_nodes;
+
+	list_for_each(tmp, &ls->ls_nodes) {
+		if (n++ != node)
+			continue;
+		memb = list_entry(tmp, struct dlm_member, list);
+		break;
+	}
+
+	DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n",
+				 ls->ls_num_nodes, n, node););
+	nodeid = memb->nodeid;
+ out:
+	return nodeid;
+}
+
+int dlm_dir_nodeid(struct dlm_rsb *r)
+{
+	return dlm_hash2nodeid(r->res_ls, r->res_hash);
+}
+
+static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len)
+{
+	uint32_t val;
+
+	val = jhash(name, len, 0);
+	val &= (ls->ls_dirtbl_size - 1);
+
+	return val;
+}
+
+static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de)
+{
+	uint32_t bucket;
+
+	bucket = dir_hash(ls, de->name, de->length);
+	list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
+}
+
+static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name,
+					  int namelen, uint32_t bucket)
+{
+	struct dlm_direntry *de;
+
+	list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) {
+		if (de->length == namelen && !memcmp(name, de->name, namelen))
+			goto out;
+	}
+	de = NULL;
+ out:
+	return de;
+}
+
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen)
+{
+	struct dlm_direntry *de;
+	uint32_t bucket;
+
+	bucket = dir_hash(ls, name, namelen);
+
+	write_lock(&ls->ls_dirtbl[bucket].lock);
+
+	de = search_bucket(ls, name, namelen, bucket);
+
+	if (!de) {
+		log_error(ls, "remove fr %u none", nodeid);
+		goto out;
+	}
+
+	if (de->master_nodeid != nodeid) {
+		log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid);
+		goto out;
+	}
+
+	list_del(&de->list);
+	free_direntry(de);
+ out:
+	write_unlock(&ls->ls_dirtbl[bucket].lock);
+}
+
+void dlm_dir_clear(struct dlm_ls *ls)
+{
+	struct list_head *head;
+	struct dlm_direntry *de;
+	int i;
+
+	DLM_ASSERT(list_empty(&ls->ls_recover_list), );
+
+	for (i = 0; i < ls->ls_dirtbl_size; i++) {
+		write_lock(&ls->ls_dirtbl[i].lock);
+		head = &ls->ls_dirtbl[i].list;
+		while (!list_empty(head)) {
+			de = list_entry(head->next, struct dlm_direntry, list);
+			list_del(&de->list);
+			put_free_de(ls, de);
+		}
+		write_unlock(&ls->ls_dirtbl[i].lock);
+	}
+}
+
+int dlm_recover_directory(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	struct dlm_direntry *de;
+	char *b, *last_name = NULL;
+	int error = -ENOMEM, last_len, count = 0;
+	uint16_t namelen;
+
+	log_debug(ls, "dlm_recover_directory");
+
+	if (dlm_no_directory(ls))
+		goto out_status;
+
+	dlm_dir_clear(ls);
+
+	last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL);
+	if (!last_name)
+		goto out;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		memset(last_name, 0, DLM_RESNAME_MAXLEN);
+		last_len = 0;
+
+		for (;;) {
+			error = dlm_recovery_stopped(ls);
+			if (error)
+				goto out_free;
+
+			error = dlm_rcom_names(ls, memb->nodeid,
+					       last_name, last_len);
+			if (error)
+				goto out_free;
+
+			schedule();
+
+			/*
+			 * pick namelen/name pairs out of received buffer
+			 */
+
+			b = ls->ls_recover_buf + sizeof(struct dlm_rcom);
+
+			for (;;) {
+				memcpy(&namelen, b, sizeof(uint16_t));
+				namelen = be16_to_cpu(namelen);
+				b += sizeof(uint16_t);
+
+				/* namelen of 0xFFFFF marks end of names for
+				   this node; namelen of 0 marks end of the
+				   buffer */
+
+				if (namelen == 0xFFFF)
+					goto done;
+				if (!namelen)
+					break;
+
+				error = -ENOMEM;
+				de = get_free_de(ls, namelen);
+				if (!de)
+					goto out_free;
+
+				de->master_nodeid = memb->nodeid;
+				de->length = namelen;
+				last_len = namelen;
+				memcpy(de->name, b, namelen);
+				memcpy(last_name, b, namelen);
+				b += namelen;
+
+				add_entry_to_hash(ls, de);
+				count++;
+			}
+		}
+         done:
+		;
+	}
+
+ out_status:
+	error = 0;
+	dlm_set_recover_status(ls, DLM_RS_DIR);
+	log_debug(ls, "dlm_recover_directory %d entries", count);
+ out_free:
+	kfree(last_name);
+ out:
+	dlm_clear_free_entries(ls);
+	return error;
+}
+
+static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
+		     int namelen, int *r_nodeid)
+{
+	struct dlm_direntry *de, *tmp;
+	uint32_t bucket;
+
+	bucket = dir_hash(ls, name, namelen);
+
+	write_lock(&ls->ls_dirtbl[bucket].lock);
+	de = search_bucket(ls, name, namelen, bucket);
+	if (de) {
+		*r_nodeid = de->master_nodeid;
+		write_unlock(&ls->ls_dirtbl[bucket].lock);
+		if (*r_nodeid == nodeid)
+			return -EEXIST;
+		return 0;
+	}
+
+	write_unlock(&ls->ls_dirtbl[bucket].lock);
+
+	de = allocate_direntry(ls, namelen);
+	if (!de)
+		return -ENOMEM;
+
+	de->master_nodeid = nodeid;
+	de->length = namelen;
+	memcpy(de->name, name, namelen);
+
+	write_lock(&ls->ls_dirtbl[bucket].lock);
+	tmp = search_bucket(ls, name, namelen, bucket);
+	if (tmp) {
+		free_direntry(de);
+		de = tmp;
+	} else {
+		list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
+	}
+	*r_nodeid = de->master_nodeid;
+	write_unlock(&ls->ls_dirtbl[bucket].lock);
+	return 0;
+}
+
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+		   int *r_nodeid)
+{
+	return get_entry(ls, nodeid, name, namelen, r_nodeid);
+}
+
+/* Copy the names of master rsb's into the buffer provided.
+   Only select names whose dir node is the given nodeid. */
+
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+ 			   char *outbuf, int outlen, int nodeid)
+{
+	struct list_head *list;
+	struct dlm_rsb *start_r = NULL, *r = NULL;
+	int offset = 0, start_namelen, error, dir_nodeid;
+	char *start_name;
+	uint16_t be_namelen;
+
+	/*
+	 * Find the rsb where we left off (or start again)
+	 */
+
+	start_namelen = inlen;
+	start_name = inbuf;
+
+	if (start_namelen > 1) {
+		/*
+		 * We could also use a find_rsb_root() function here that
+		 * searched the ls_root_list.
+		 */
+		error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER,
+				     &start_r);
+		DLM_ASSERT(!error && start_r,
+			   printk("error %d\n", error););
+		DLM_ASSERT(!list_empty(&start_r->res_root_list),
+			   dlm_print_rsb(start_r););
+		dlm_put_rsb(start_r);
+	}
+
+	/*
+	 * Send rsb names for rsb's we're master of and whose directory node
+	 * matches the requesting node.
+	 */
+
+	down_read(&ls->ls_root_sem);
+	if (start_r)
+		list = start_r->res_root_list.next;
+	else
+		list = ls->ls_root_list.next;
+
+	for (offset = 0; list != &ls->ls_root_list; list = list->next) {
+		r = list_entry(list, struct dlm_rsb, res_root_list);
+		if (r->res_nodeid)
+			continue;
+
+		dir_nodeid = dlm_dir_nodeid(r);
+		if (dir_nodeid != nodeid)
+			continue;
+
+		/*
+		 * The block ends when we can't fit the following in the
+		 * remaining buffer space:
+		 * namelen (uint16_t) +
+		 * name (r->res_length) +
+		 * end-of-block record 0x0000 (uint16_t)
+		 */
+
+		if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
+			/* Write end-of-block record */
+			be_namelen = 0;
+			memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+			offset += sizeof(uint16_t);
+			goto out;
+		}
+
+		be_namelen = cpu_to_be16(r->res_length);
+		memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+		offset += sizeof(uint16_t);
+		memcpy(outbuf + offset, r->res_name, r->res_length);
+		offset += r->res_length;
+	}
+
+	/*
+	 * If we've reached the end of the list (and there's room) write a
+	 * terminating record.
+	 */
+
+	if ((list == &ls->ls_root_list) &&
+	    (offset + sizeof(uint16_t) <= outlen)) {
+		be_namelen = 0xFFFF;
+		memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
+		offset += sizeof(uint16_t);
+	}
+
+ out:
+	up_read(&ls->ls_root_sem);
+}
+
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
new file mode 100644
index 000000000000..0b0eb1267b6e
--- /dev/null
+++ b/fs/dlm/dir.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+
+int dlm_dir_nodeid(struct dlm_rsb *rsb);
+int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
+void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len);
+void dlm_dir_clear(struct dlm_ls *ls);
+void dlm_clear_free_entries(struct dlm_ls *ls);
+int dlm_recover_directory(struct dlm_ls *ls);
+int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen,
+	int *r_nodeid);
+void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+	char *outbuf, int outlen, int nodeid);
+
+#endif				/* __DIR_DOT_H__ */
+
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
new file mode 100644
index 000000000000..1e5cd67e1b7a
--- /dev/null
+++ b/fs/dlm/dlm_internal.h
@@ -0,0 +1,543 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __DLM_INTERNAL_DOT_H__
+#define __DLM_INTERNAL_DOT_H__
+
+/*
+ * This is the main header file to be included in each DLM source file.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/socket.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/kref.h>
+#include <linux/kernel.h>
+#include <linux/jhash.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+
+#include <linux/dlm.h>
+
+#define DLM_LOCKSPACE_LEN	64
+
+/* Size of the temp buffer midcomms allocates on the stack.
+   We try to make this large enough so most messages fit.
+   FIXME: should sctp make this unnecessary? */
+
+#define DLM_INBUF_LEN		148
+
+struct dlm_ls;
+struct dlm_lkb;
+struct dlm_rsb;
+struct dlm_member;
+struct dlm_lkbtable;
+struct dlm_rsbtable;
+struct dlm_dirtable;
+struct dlm_direntry;
+struct dlm_recover;
+struct dlm_header;
+struct dlm_message;
+struct dlm_rcom;
+struct dlm_mhandle;
+
+#define log_print(fmt, args...) \
+	printk(KERN_ERR "dlm: "fmt"\n" , ##args)
+#define log_error(ls, fmt, args...) \
+	printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+
+#define DLM_LOG_DEBUG
+#ifdef DLM_LOG_DEBUG
+#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
+#else
+#define log_debug(ls, fmt, args...)
+#endif
+
+#define DLM_ASSERT(x, do) \
+{ \
+  if (!(x)) \
+  { \
+    printk(KERN_ERR "\nDLM:  Assertion failed on line %d of file %s\n" \
+               "DLM:  assertion:  \"%s\"\n" \
+               "DLM:  time = %lu\n", \
+               __LINE__, __FILE__, #x, jiffies); \
+    {do} \
+    printk("\n"); \
+    BUG(); \
+    panic("DLM:  Record message above and reboot.\n"); \
+  } \
+}
+
+#define DLM_FAKE_USER_AST ERR_PTR(-EINVAL)
+
+
+struct dlm_direntry {
+	struct list_head	list;
+	uint32_t		master_nodeid;
+	uint16_t		length;
+	char			name[1];
+};
+
+struct dlm_dirtable {
+	struct list_head	list;
+	rwlock_t		lock;
+};
+
+struct dlm_rsbtable {
+	struct list_head	list;
+	struct list_head	toss;
+	rwlock_t		lock;
+};
+
+struct dlm_lkbtable {
+	struct list_head	list;
+	rwlock_t		lock;
+	uint16_t		counter;
+};
+
+/*
+ * Lockspace member (per node in a ls)
+ */
+
+struct dlm_member {
+	struct list_head	list;
+	int			nodeid;
+	int			weight;
+};
+
+/*
+ * Save and manage recovery state for a lockspace.
+ */
+
+struct dlm_recover {
+	struct list_head	list;
+	int			*nodeids;
+	int			node_count;
+	uint64_t		seq;
+};
+
+/*
+ * Pass input args to second stage locking function.
+ */
+
+struct dlm_args {
+	uint32_t		flags;
+	void			*astaddr;
+	long			astparam;
+	void			*bastaddr;
+	int			mode;
+	struct dlm_lksb		*lksb;
+};
+
+
+/*
+ * Lock block
+ *
+ * A lock can be one of three types:
+ *
+ * local copy      lock is mastered locally
+ *                 (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set)
+ * process copy    lock is mastered on a remote node
+ *                 (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set)
+ * master copy     master node's copy of a lock owned by remote node
+ *                 (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set)
+ *
+ * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or
+ * dlm_unlock.  The dlm does not modify these or use any private flags in
+ * this field; it only contains DLM_LKF_ flags from dlm.h.  These flags
+ * are sent as-is to the remote master when the lock is remote.
+ *
+ * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h.
+ * Some internal flags are shared between the master and process nodes;
+ * these shared flags are kept in the lower two bytes.  One of these
+ * flags set on the master copy will be propagated to the process copy
+ * and v.v.  Other internal flags are private to the master or process
+ * node (e.g. DLM_IFL_MSTCPY).  These are kept in the high two bytes.
+ *
+ * lkb_sbflags: status block flags.  These flags are copied directly into
+ * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion
+ * ast.  All defined in dlm.h with DLM_SBF_ prefix.
+ *
+ * lkb_status: the lock status indicates which rsb queue the lock is
+ * on, grant, convert, or wait.  DLM_LKSTS_ WAITING/GRANTED/CONVERT
+ *
+ * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
+ * reply is needed.  Only set when the lkb is on the lockspace waiters
+ * list awaiting a reply from a remote node.
+ *
+ * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
+ * is a master copy, nodeid specifies the remote lock holder, when the
+ * lkb is a process copy, the nodeid specifies the lock master.
+ */
+
+/* lkb_ast_type */
+
+#define AST_COMP		1
+#define AST_BAST		2
+
+/* lkb_status */
+
+#define DLM_LKSTS_WAITING	1
+#define DLM_LKSTS_GRANTED	2
+#define DLM_LKSTS_CONVERT	3
+
+/* lkb_flags */
+
+#define DLM_IFL_MSTCPY		0x00010000
+#define DLM_IFL_RESEND		0x00020000
+#define DLM_IFL_DEAD		0x00040000
+#define DLM_IFL_USER		0x00000001
+#define DLM_IFL_ORPHAN		0x00000002
+
+struct dlm_lkb {
+	struct dlm_rsb		*lkb_resource;	/* the rsb */
+	struct kref		lkb_ref;
+	int			lkb_nodeid;	/* copied from rsb */
+	int			lkb_ownpid;	/* pid of lock owner */
+	uint32_t		lkb_id;		/* our lock ID */
+	uint32_t		lkb_remid;	/* lock ID on remote partner */
+	uint32_t		lkb_exflags;	/* external flags from caller */
+	uint32_t		lkb_sbflags;	/* lksb flags */
+	uint32_t		lkb_flags;	/* internal flags */
+	uint32_t		lkb_lvbseq;	/* lvb sequence number */
+
+	int8_t			lkb_status;     /* granted, waiting, convert */
+	int8_t			lkb_rqmode;	/* requested lock mode */
+	int8_t			lkb_grmode;	/* granted lock mode */
+	int8_t			lkb_bastmode;	/* requested mode */
+	int8_t			lkb_highbast;	/* highest mode bast sent for */
+
+	int8_t			lkb_wait_type;	/* type of reply waiting for */
+	int8_t			lkb_ast_type;	/* type of ast queued for */
+
+	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
+	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
+	struct list_head	lkb_rsb_lookup;	/* waiting for rsb lookup */
+	struct list_head	lkb_wait_reply;	/* waiting for remote reply */
+	struct list_head	lkb_astqueue;	/* need ast to be sent */
+	struct list_head	lkb_ownqueue;	/* list of locks for a process */
+
+	char			*lkb_lvbptr;
+	struct dlm_lksb		*lkb_lksb;      /* caller's status block */
+	void			*lkb_astaddr;	/* caller's ast function */
+	void			*lkb_bastaddr;	/* caller's bast function */
+	long			lkb_astparam;	/* caller's ast arg */
+};
+
+
+struct dlm_rsb {
+	struct dlm_ls		*res_ls;	/* the lockspace */
+	struct kref		res_ref;
+	struct mutex		res_mutex;
+	unsigned long		res_flags;
+	int			res_length;	/* length of rsb name */
+	int			res_nodeid;
+	uint32_t                res_lvbseq;
+	uint32_t		res_hash;
+	uint32_t		res_bucket;	/* rsbtbl */
+	unsigned long		res_toss_time;
+	uint32_t		res_first_lkid;
+	struct list_head	res_lookup;	/* lkbs waiting on first */
+	struct list_head	res_hashchain;	/* rsbtbl */
+	struct list_head	res_grantqueue;
+	struct list_head	res_convertqueue;
+	struct list_head	res_waitqueue;
+
+	struct list_head	res_root_list;	    /* used for recovery */
+	struct list_head	res_recover_list;   /* used for recovery */
+	int			res_recover_locks_count;
+
+	char			*res_lvbptr;
+	char			res_name[1];
+};
+
+/* find_rsb() flags */
+
+#define R_MASTER		1	/* only return rsb if it's a master */
+#define R_CREATE		2	/* create/add rsb if not found */
+
+/* rsb_flags */
+
+enum rsb_flags {
+	RSB_MASTER_UNCERTAIN,
+	RSB_VALNOTVALID,
+	RSB_VALNOTVALID_PREV,
+	RSB_NEW_MASTER,
+	RSB_NEW_MASTER2,
+	RSB_RECOVER_CONVERT,
+	RSB_LOCKS_PURGED,
+};
+
+static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+	__set_bit(flag, &r->res_flags);
+}
+
+static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+	__clear_bit(flag, &r->res_flags);
+}
+
+static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
+{
+	return test_bit(flag, &r->res_flags);
+}
+
+
+/* dlm_header is first element of all structs sent between nodes */
+
+#define DLM_HEADER_MAJOR	0x00020000
+#define DLM_HEADER_MINOR	0x00000001
+
+#define DLM_MSG			1
+#define DLM_RCOM		2
+
+struct dlm_header {
+	uint32_t		h_version;
+	uint32_t		h_lockspace;
+	uint32_t		h_nodeid;	/* nodeid of sender */
+	uint16_t		h_length;
+	uint8_t			h_cmd;		/* DLM_MSG, DLM_RCOM */
+	uint8_t			h_pad;
+};
+
+
+#define DLM_MSG_REQUEST		1
+#define DLM_MSG_CONVERT		2
+#define DLM_MSG_UNLOCK		3
+#define DLM_MSG_CANCEL		4
+#define DLM_MSG_REQUEST_REPLY	5
+#define DLM_MSG_CONVERT_REPLY	6
+#define DLM_MSG_UNLOCK_REPLY	7
+#define DLM_MSG_CANCEL_REPLY	8
+#define DLM_MSG_GRANT		9
+#define DLM_MSG_BAST		10
+#define DLM_MSG_LOOKUP		11
+#define DLM_MSG_REMOVE		12
+#define DLM_MSG_LOOKUP_REPLY	13
+
+struct dlm_message {
+	struct dlm_header	m_header;
+	uint32_t		m_type;		/* DLM_MSG_ */
+	uint32_t		m_nodeid;
+	uint32_t		m_pid;
+	uint32_t		m_lkid;		/* lkid on sender */
+	uint32_t		m_remid;	/* lkid on receiver */
+	uint32_t		m_parent_lkid;
+	uint32_t		m_parent_remid;
+	uint32_t		m_exflags;
+	uint32_t		m_sbflags;
+	uint32_t		m_flags;
+	uint32_t		m_lvbseq;
+	uint32_t		m_hash;
+	int			m_status;
+	int			m_grmode;
+	int			m_rqmode;
+	int			m_bastmode;
+	int			m_asts;
+	int			m_result;	/* 0 or -EXXX */
+	char			m_extra[0];	/* name or lvb */
+};
+
+
+#define DLM_RS_NODES		0x00000001
+#define DLM_RS_NODES_ALL	0x00000002
+#define DLM_RS_DIR		0x00000004
+#define DLM_RS_DIR_ALL		0x00000008
+#define DLM_RS_LOCKS		0x00000010
+#define DLM_RS_LOCKS_ALL	0x00000020
+#define DLM_RS_DONE		0x00000040
+#define DLM_RS_DONE_ALL		0x00000080
+
+#define DLM_RCOM_STATUS		1
+#define DLM_RCOM_NAMES		2
+#define DLM_RCOM_LOOKUP		3
+#define DLM_RCOM_LOCK		4
+#define DLM_RCOM_STATUS_REPLY	5
+#define DLM_RCOM_NAMES_REPLY	6
+#define DLM_RCOM_LOOKUP_REPLY	7
+#define DLM_RCOM_LOCK_REPLY	8
+
+struct dlm_rcom {
+	struct dlm_header	rc_header;
+	uint32_t		rc_type;	/* DLM_RCOM_ */
+	int			rc_result;	/* multi-purpose */
+	uint64_t		rc_id;		/* match reply with request */
+	char			rc_buf[0];
+};
+
+struct rcom_config {
+	uint32_t		rf_lvblen;
+	uint32_t		rf_lsflags;
+	uint64_t		rf_unused;
+};
+
+struct rcom_lock {
+	uint32_t		rl_ownpid;
+	uint32_t		rl_lkid;
+	uint32_t		rl_remid;
+	uint32_t		rl_parent_lkid;
+	uint32_t		rl_parent_remid;
+	uint32_t		rl_exflags;
+	uint32_t		rl_flags;
+	uint32_t		rl_lvbseq;
+	int			rl_result;
+	int8_t			rl_rqmode;
+	int8_t			rl_grmode;
+	int8_t			rl_status;
+	int8_t			rl_asts;
+	uint16_t		rl_wait_type;
+	uint16_t		rl_namelen;
+	char			rl_name[DLM_RESNAME_MAXLEN];
+	char			rl_lvb[0];
+};
+
+struct dlm_ls {
+	struct list_head	ls_list;	/* list of lockspaces */
+	dlm_lockspace_t		*ls_local_handle;
+	uint32_t		ls_global_id;	/* global unique lockspace ID */
+	uint32_t		ls_exflags;
+	int			ls_lvblen;
+	int			ls_count;	/* reference count */
+	unsigned long		ls_flags;	/* LSFL_ */
+	struct kobject		ls_kobj;
+
+	struct dlm_rsbtable	*ls_rsbtbl;
+	uint32_t		ls_rsbtbl_size;
+
+	struct dlm_lkbtable	*ls_lkbtbl;
+	uint32_t		ls_lkbtbl_size;
+
+	struct dlm_dirtable	*ls_dirtbl;
+	uint32_t		ls_dirtbl_size;
+
+	struct mutex		ls_waiters_mutex;
+	struct list_head	ls_waiters;	/* lkbs needing a reply */
+
+	struct list_head	ls_nodes;	/* current nodes in ls */
+	struct list_head	ls_nodes_gone;	/* dead node list, recovery */
+	int			ls_num_nodes;	/* number of nodes in ls */
+	int			ls_low_nodeid;
+	int			ls_total_weight;
+	int			*ls_node_array;
+
+	struct dlm_rsb		ls_stub_rsb;	/* for returning errors */
+	struct dlm_lkb		ls_stub_lkb;	/* for returning errors */
+	struct dlm_message	ls_stub_ms;	/* for faking a reply */
+
+	struct dentry		*ls_debug_rsb_dentry; /* debugfs */
+	struct dentry		*ls_debug_waiters_dentry; /* debugfs */
+
+	wait_queue_head_t	ls_uevent_wait;	/* user part of join/leave */
+	int			ls_uevent_result;
+
+	struct miscdevice       ls_device;
+
+	/* recovery related */
+
+	struct timer_list	ls_timer;
+	struct task_struct	*ls_recoverd_task;
+	struct mutex		ls_recoverd_active;
+	spinlock_t		ls_recover_lock;
+	uint32_t		ls_recover_status; /* DLM_RS_ */
+	uint64_t		ls_recover_seq;
+	struct dlm_recover	*ls_recover_args;
+	struct rw_semaphore	ls_in_recovery;	/* block local requests */
+	struct list_head	ls_requestqueue;/* queue remote requests */
+	struct mutex		ls_requestqueue_mutex;
+	char			*ls_recover_buf;
+	int			ls_recover_nodeid; /* for debugging */
+	uint64_t		ls_rcom_seq;
+	struct list_head	ls_recover_list;
+	spinlock_t		ls_recover_list_lock;
+	int			ls_recover_list_count;
+	wait_queue_head_t	ls_wait_general;
+	struct mutex		ls_clear_proc_locks;
+
+	struct list_head	ls_root_list;	/* root resources */
+	struct rw_semaphore	ls_root_sem;	/* protect root_list */
+
+	int			ls_namelen;
+	char			ls_name[1];
+};
+
+#define LSFL_WORK		0
+#define LSFL_RUNNING		1
+#define LSFL_RECOVERY_STOP	2
+#define LSFL_RCOM_READY		3
+#define LSFL_UEVENT_WAIT	4
+
+/* much of this is just saving user space pointers associated with the
+   lock that we pass back to the user lib with an ast */
+
+struct dlm_user_args {
+	struct dlm_user_proc	*proc; /* each process that opens the lockspace
+					  device has private data
+					  (dlm_user_proc) on the struct file,
+					  the process's locks point back to it*/
+	struct dlm_lksb		lksb;
+	int			old_mode;
+	int			update_user_lvb;
+	struct dlm_lksb __user	*user_lksb;
+	void __user		*castparam;
+	void __user		*castaddr;
+	void __user		*bastparam;
+	void __user		*bastaddr;
+};
+
+#define DLM_PROC_FLAGS_CLOSING 1
+#define DLM_PROC_FLAGS_COMPAT  2
+
+/* locks list is kept so we can remove all a process's locks when it
+   exits (or orphan those that are persistent) */
+
+struct dlm_user_proc {
+	dlm_lockspace_t		*lockspace;
+	unsigned long		flags; /* DLM_PROC_FLAGS */
+	struct list_head	asts;
+	spinlock_t		asts_spin;
+	struct list_head	locks;
+	spinlock_t		locks_spin;
+	wait_queue_head_t	wait;
+};
+
+static inline int dlm_locking_stopped(struct dlm_ls *ls)
+{
+	return !test_bit(LSFL_RUNNING, &ls->ls_flags);
+}
+
+static inline int dlm_recovery_stopped(struct dlm_ls *ls)
+{
+	return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+}
+
+static inline int dlm_no_directory(struct dlm_ls *ls)
+{
+	return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
+}
+
+#endif				/* __DLM_INTERNAL_DOT_H__ */
+
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
new file mode 100644
index 000000000000..3f2befa4797b
--- /dev/null
+++ b/fs/dlm/lock.c
@@ -0,0 +1,3871 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/* Central locking logic has four stages:
+
+   dlm_lock()
+   dlm_unlock()
+
+   request_lock(ls, lkb)
+   convert_lock(ls, lkb)
+   unlock_lock(ls, lkb)
+   cancel_lock(ls, lkb)
+
+   _request_lock(r, lkb)
+   _convert_lock(r, lkb)
+   _unlock_lock(r, lkb)
+   _cancel_lock(r, lkb)
+
+   do_request(r, lkb)
+   do_convert(r, lkb)
+   do_unlock(r, lkb)
+   do_cancel(r, lkb)
+
+   Stage 1 (lock, unlock) is mainly about checking input args and
+   splitting into one of the four main operations:
+
+       dlm_lock          = request_lock
+       dlm_lock+CONVERT  = convert_lock
+       dlm_unlock        = unlock_lock
+       dlm_unlock+CANCEL = cancel_lock
+
+   Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
+   provided to the next stage.
+
+   Stage 3, _xxxx_lock(), determines if the operation is local or remote.
+   When remote, it calls send_xxxx(), when local it calls do_xxxx().
+
+   Stage 4, do_xxxx(), is the guts of the operation.  It manipulates the
+   given rsb and lkb and queues callbacks.
+
+   For remote operations, send_xxxx() results in the corresponding do_xxxx()
+   function being executed on the remote node.  The connecting send/receive
+   calls on local (L) and remote (R) nodes:
+
+   L: send_xxxx()              ->  R: receive_xxxx()
+                                   R: do_xxxx()
+   L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
+*/
+#include <linux/types.h>
+#include "dlm_internal.h"
+#include <linux/dlm_device.h>
+#include "memory.h"
+#include "lowcomms.h"
+#include "requestqueue.h"
+#include "util.h"
+#include "dir.h"
+#include "member.h"
+#include "lockspace.h"
+#include "ast.h"
+#include "lock.h"
+#include "rcom.h"
+#include "recover.h"
+#include "lvb_table.h"
+#include "user.h"
+#include "config.h"
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static int send_remove(struct dlm_rsb *r);
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+				    struct dlm_message *ms);
+static int receive_extralen(struct dlm_message *ms);
+
+/*
+ * Lock compatibilty matrix - thanks Steve
+ * UN = Unlocked state. Not really a state, used as a flag
+ * PD = Padding. Used to make the matrix a nice power of two in size
+ * Other states are the same as the VMS DLM.
+ * Usage: matrix[grmode+1][rqmode+1]  (although m[rq+1][gr+1] is the same)
+ */
+
+static const int __dlm_compat_matrix[8][8] = {
+      /* UN NL CR CW PR PW EX PD */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
+        {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
+        {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
+        {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
+        {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
+        {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
+        {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
+        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
+};
+
+/*
+ * This defines the direction of transfer of LVB data.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ * 1 = LVB is returned to the caller
+ * 0 = LVB is written to the resource
+ * -1 = nothing happens to the LVB
+ */
+
+const int dlm_lvb_operations[8][8] = {
+        /* UN   NL  CR  CW  PR  PW  EX  PD*/
+        {  -1,  1,  1,  1,  1,  1,  1, -1 }, /* UN */
+        {  -1,  1,  1,  1,  1,  1,  1,  0 }, /* NL */
+        {  -1, -1,  1,  1,  1,  1,  1,  0 }, /* CR */
+        {  -1, -1, -1,  1,  1,  1,  1,  0 }, /* CW */
+        {  -1, -1, -1, -1,  1,  1,  1,  0 }, /* PR */
+        {  -1,  0,  0,  0,  0,  0,  1,  0 }, /* PW */
+        {  -1,  0,  0,  0,  0,  0,  0,  0 }, /* EX */
+        {  -1,  0,  0,  0,  0,  0,  0,  0 }  /* PD */
+};
+
+#define modes_compat(gr, rq) \
+	__dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
+
+int dlm_modes_compat(int mode1, int mode2)
+{
+	return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
+}
+
+/*
+ * Compatibility matrix for conversions with QUECVT set.
+ * Granted mode is the row; requested mode is the column.
+ * Usage: matrix[grmode+1][rqmode+1]
+ */
+
+static const int __quecvt_compat_matrix[8][8] = {
+      /* UN NL CR CW PR PW EX PD */
+        {0, 0, 0, 0, 0, 0, 0, 0},       /* UN */
+        {0, 0, 1, 1, 1, 1, 1, 0},       /* NL */
+        {0, 0, 0, 1, 1, 1, 1, 0},       /* CR */
+        {0, 0, 0, 0, 1, 1, 1, 0},       /* CW */
+        {0, 0, 0, 1, 0, 1, 1, 0},       /* PR */
+        {0, 0, 0, 0, 0, 0, 1, 0},       /* PW */
+        {0, 0, 0, 0, 0, 0, 0, 0},       /* EX */
+        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
+};
+
+void dlm_print_lkb(struct dlm_lkb *lkb)
+{
+	printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
+	       "     status %d rqmode %d grmode %d wait_type %d ast_type %d\n",
+	       lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
+	       lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
+	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
+}
+
+void dlm_print_rsb(struct dlm_rsb *r)
+{
+	printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
+	       r->res_nodeid, r->res_flags, r->res_first_lkid,
+	       r->res_recover_locks_count, r->res_name);
+}
+
+void dlm_dump_rsb(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb;
+
+	dlm_print_rsb(r);
+
+	printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
+	       list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
+	printk(KERN_ERR "rsb lookup list\n");
+	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
+		dlm_print_lkb(lkb);
+	printk(KERN_ERR "rsb grant queue:\n");
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
+		dlm_print_lkb(lkb);
+	printk(KERN_ERR "rsb convert queue:\n");
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
+		dlm_print_lkb(lkb);
+	printk(KERN_ERR "rsb wait queue:\n");
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
+		dlm_print_lkb(lkb);
+}
+
+/* Threads cannot use the lockspace while it's being recovered */
+
+static inline void lock_recovery(struct dlm_ls *ls)
+{
+	down_read(&ls->ls_in_recovery);
+}
+
+static inline void unlock_recovery(struct dlm_ls *ls)
+{
+	up_read(&ls->ls_in_recovery);
+}
+
+static inline int lock_recovery_try(struct dlm_ls *ls)
+{
+	return down_read_trylock(&ls->ls_in_recovery);
+}
+
+static inline int can_be_queued(struct dlm_lkb *lkb)
+{
+	return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
+}
+
+static inline int force_blocking_asts(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
+}
+
+static inline int is_demoted(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
+}
+
+static inline int is_remote(struct dlm_rsb *r)
+{
+	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
+	return !!r->res_nodeid;
+}
+
+static inline int is_process_copy(struct dlm_lkb *lkb)
+{
+	return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
+}
+
+static inline int is_master_copy(struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+		DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
+	return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
+}
+
+static inline int middle_conversion(struct dlm_lkb *lkb)
+{
+	if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
+	    (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
+		return 1;
+	return 0;
+}
+
+static inline int down_conversion(struct dlm_lkb *lkb)
+{
+	return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
+}
+
+static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	if (is_master_copy(lkb))
+		return;
+
+	DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+
+	lkb->lkb_lksb->sb_status = rv;
+	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
+
+	dlm_add_ast(lkb, AST_COMP);
+}
+
+static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
+{
+	if (is_master_copy(lkb))
+		send_bast(r, lkb, rqmode);
+	else {
+		lkb->lkb_bastmode = rqmode;
+		dlm_add_ast(lkb, AST_BAST);
+	}
+}
+
+/*
+ * Basic operations on rsb's and lkb's
+ */
+
+static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
+{
+	struct dlm_rsb *r;
+
+	r = allocate_rsb(ls, len);
+	if (!r)
+		return NULL;
+
+	r->res_ls = ls;
+	r->res_length = len;
+	memcpy(r->res_name, name, len);
+	mutex_init(&r->res_mutex);
+
+	INIT_LIST_HEAD(&r->res_lookup);
+	INIT_LIST_HEAD(&r->res_grantqueue);
+	INIT_LIST_HEAD(&r->res_convertqueue);
+	INIT_LIST_HEAD(&r->res_waitqueue);
+	INIT_LIST_HEAD(&r->res_root_list);
+	INIT_LIST_HEAD(&r->res_recover_list);
+
+	return r;
+}
+
+static int search_rsb_list(struct list_head *head, char *name, int len,
+			   unsigned int flags, struct dlm_rsb **r_ret)
+{
+	struct dlm_rsb *r;
+	int error = 0;
+
+	list_for_each_entry(r, head, res_hashchain) {
+		if (len == r->res_length && !memcmp(name, r->res_name, len))
+			goto found;
+	}
+	return -EBADR;
+
+ found:
+	if (r->res_nodeid && (flags & R_MASTER))
+		error = -ENOTBLK;
+	*r_ret = r;
+	return error;
+}
+
+static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+		       unsigned int flags, struct dlm_rsb **r_ret)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
+	if (!error) {
+		kref_get(&r->res_ref);
+		goto out;
+	}
+	error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
+	if (error)
+		goto out;
+
+	list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);
+
+	if (dlm_no_directory(ls))
+		goto out;
+
+	if (r->res_nodeid == -1) {
+		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+		r->res_first_lkid = 0;
+	} else if (r->res_nodeid > 0) {
+		rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
+		r->res_first_lkid = 0;
+	} else {
+		DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
+		DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
+	}
+ out:
+	*r_ret = r;
+	return error;
+}
+
+static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
+		      unsigned int flags, struct dlm_rsb **r_ret)
+{
+	int error;
+	write_lock(&ls->ls_rsbtbl[b].lock);
+	error = _search_rsb(ls, name, len, b, flags, r_ret);
+	write_unlock(&ls->ls_rsbtbl[b].lock);
+	return error;
+}
+
+/*
+ * Find rsb in rsbtbl and potentially create/add one
+ *
+ * Delaying the release of rsb's has a similar benefit to applications keeping
+ * NL locks on an rsb, but without the guarantee that the cached master value
+ * will still be valid when the rsb is reused.  Apps aren't always smart enough
+ * to keep NL locks on an rsb that they may lock again shortly; this can lead
+ * to excessive master lookups and removals if we don't delay the release.
+ *
+ * Searching for an rsb means looking through both the normal list and toss
+ * list.  When found on the toss list the rsb is moved to the normal list with
+ * ref count of 1; when found on normal list the ref count is incremented.
+ */
+
+static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
+		    unsigned int flags, struct dlm_rsb **r_ret)
+{
+	struct dlm_rsb *r, *tmp;
+	uint32_t hash, bucket;
+	int error = 0;
+
+	if (dlm_no_directory(ls))
+		flags |= R_CREATE;
+
+	hash = jhash(name, namelen, 0);
+	bucket = hash & (ls->ls_rsbtbl_size - 1);
+
+	error = search_rsb(ls, name, namelen, bucket, flags, &r);
+	if (!error)
+		goto out;
+
+	if (error == -EBADR && !(flags & R_CREATE))
+		goto out;
+
+	/* the rsb was found but wasn't a master copy */
+	if (error == -ENOTBLK)
+		goto out;
+
+	error = -ENOMEM;
+	r = create_rsb(ls, name, namelen);
+	if (!r)
+		goto out;
+
+	r->res_hash = hash;
+	r->res_bucket = bucket;
+	r->res_nodeid = -1;
+	kref_init(&r->res_ref);
+
+	/* With no directory, the master can be set immediately */
+	if (dlm_no_directory(ls)) {
+		int nodeid = dlm_dir_nodeid(r);
+		if (nodeid == dlm_our_nodeid())
+			nodeid = 0;
+		r->res_nodeid = nodeid;
+	}
+
+	write_lock(&ls->ls_rsbtbl[bucket].lock);
+	error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
+	if (!error) {
+		write_unlock(&ls->ls_rsbtbl[bucket].lock);
+		free_rsb(r);
+		r = tmp;
+		goto out;
+	}
+	list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
+	write_unlock(&ls->ls_rsbtbl[bucket].lock);
+	error = 0;
+ out:
+	*r_ret = r;
+	return error;
+}
+
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+		 unsigned int flags, struct dlm_rsb **r_ret)
+{
+	return find_rsb(ls, name, namelen, flags, r_ret);
+}
+
+/* This is only called to add a reference when the code already holds
+   a valid reference to the rsb, so there's no need for locking. */
+
+static inline void hold_rsb(struct dlm_rsb *r)
+{
+	kref_get(&r->res_ref);
+}
+
+void dlm_hold_rsb(struct dlm_rsb *r)
+{
+	hold_rsb(r);
+}
+
+static void toss_rsb(struct kref *kref)
+{
+	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+	struct dlm_ls *ls = r->res_ls;
+
+	DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
+	kref_init(&r->res_ref);
+	list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
+	r->res_toss_time = jiffies;
+	if (r->res_lvbptr) {
+		free_lvb(r->res_lvbptr);
+		r->res_lvbptr = NULL;
+	}
+}
+
+/* When all references to the rsb are gone it's transfered to
+   the tossed list for later disposal. */
+
+static void put_rsb(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+	uint32_t bucket = r->res_bucket;
+
+	write_lock(&ls->ls_rsbtbl[bucket].lock);
+	kref_put(&r->res_ref, toss_rsb);
+	write_unlock(&ls->ls_rsbtbl[bucket].lock);
+}
+
+void dlm_put_rsb(struct dlm_rsb *r)
+{
+	put_rsb(r);
+}
+
+/* See comment for unhold_lkb */
+
+static void unhold_rsb(struct dlm_rsb *r)
+{
+	int rv;
+	rv = kref_put(&r->res_ref, toss_rsb);
+	DLM_ASSERT(!rv, dlm_dump_rsb(r););
+}
+
+static void kill_rsb(struct kref *kref)
+{
+	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
+
+	/* All work is done after the return from kref_put() so we
+	   can release the write_lock before the remove and free. */
+
+	DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
+	DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
+}
+
+/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
+   The rsb must exist as long as any lkb's for it do. */
+
+static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	hold_rsb(r);
+	lkb->lkb_resource = r;
+}
+
+static void detach_lkb(struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_resource) {
+		put_rsb(lkb->lkb_resource);
+		lkb->lkb_resource = NULL;
+	}
+}
+
+static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+	struct dlm_lkb *lkb, *tmp;
+	uint32_t lkid = 0;
+	uint16_t bucket;
+
+	lkb = allocate_lkb(ls);
+	if (!lkb)
+		return -ENOMEM;
+
+	lkb->lkb_nodeid = -1;
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	kref_init(&lkb->lkb_ref);
+	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
+
+	get_random_bytes(&bucket, sizeof(bucket));
+	bucket &= (ls->ls_lkbtbl_size - 1);
+
+	write_lock(&ls->ls_lkbtbl[bucket].lock);
+
+	/* counter can roll over so we must verify lkid is not in use */
+
+	while (lkid == 0) {
+		lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);
+
+		list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
+				    lkb_idtbl_list) {
+			if (tmp->lkb_id != lkid)
+				continue;
+			lkid = 0;
+			break;
+		}
+	}
+
+	lkb->lkb_id = lkid;
+	list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
+	write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+	*lkb_ret = lkb;
+	return 0;
+}
+
+static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
+{
+	uint16_t bucket = lkid & 0xFFFF;
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
+		if (lkb->lkb_id == lkid)
+			return lkb;
+	}
+	return NULL;
+}
+
+static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
+{
+	struct dlm_lkb *lkb;
+	uint16_t bucket = lkid & 0xFFFF;
+
+	if (bucket >= ls->ls_lkbtbl_size)
+		return -EBADSLT;
+
+	read_lock(&ls->ls_lkbtbl[bucket].lock);
+	lkb = __find_lkb(ls, lkid);
+	if (lkb)
+		kref_get(&lkb->lkb_ref);
+	read_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+	*lkb_ret = lkb;
+	return lkb ? 0 : -ENOENT;
+}
+
+static void kill_lkb(struct kref *kref)
+{
+	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+	/* All work is done after the return from kref_put() so we
+	   can release the write_lock before the detach_lkb */
+
+	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+}
+
+/* __put_lkb() is used when an lkb may not have an rsb attached to
+   it so we need to provide the lockspace explicitly */
+
+static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	uint16_t bucket = lkb->lkb_id & 0xFFFF;
+
+	write_lock(&ls->ls_lkbtbl[bucket].lock);
+	if (kref_put(&lkb->lkb_ref, kill_lkb)) {
+		list_del(&lkb->lkb_idtbl_list);
+		write_unlock(&ls->ls_lkbtbl[bucket].lock);
+
+		detach_lkb(lkb);
+
+		/* for local/process lkbs, lvbptr points to caller's lksb */
+		if (lkb->lkb_lvbptr && is_master_copy(lkb))
+			free_lvb(lkb->lkb_lvbptr);
+		free_lkb(lkb);
+		return 1;
+	} else {
+		write_unlock(&ls->ls_lkbtbl[bucket].lock);
+		return 0;
+	}
+}
+
+int dlm_put_lkb(struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls;
+
+	DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
+	DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
+
+	ls = lkb->lkb_resource->res_ls;
+	return __put_lkb(ls, lkb);
+}
+
+/* This is only called to add a reference when the code already holds
+   a valid reference to the lkb, so there's no need for locking. */
+
+static inline void hold_lkb(struct dlm_lkb *lkb)
+{
+	kref_get(&lkb->lkb_ref);
+}
+
+/* This is called when we need to remove a reference and are certain
+   it's not the last ref.  e.g. del_lkb is always called between a
+   find_lkb/put_lkb and is always the inverse of a previous add_lkb.
+   put_lkb would work fine, but would involve unnecessary locking */
+
+static inline void unhold_lkb(struct dlm_lkb *lkb)
+{
+	int rv;
+	rv = kref_put(&lkb->lkb_ref, kill_lkb);
+	DLM_ASSERT(!rv, dlm_print_lkb(lkb););
+}
+
+static void lkb_add_ordered(struct list_head *new, struct list_head *head,
+			    int mode)
+{
+	struct dlm_lkb *lkb = NULL;
+
+	list_for_each_entry(lkb, head, lkb_statequeue)
+		if (lkb->lkb_rqmode < mode)
+			break;
+
+	if (!lkb)
+		list_add_tail(new, head);
+	else
+		__list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue);
+}
+
+/* add/remove lkb to rsb's grant/convert/wait queue */
+
+static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
+{
+	kref_get(&lkb->lkb_ref);
+
+	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
+
+	lkb->lkb_status = status;
+
+	switch (status) {
+	case DLM_LKSTS_WAITING:
+		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+			list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
+		else
+			list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
+		break;
+	case DLM_LKSTS_GRANTED:
+		/* convention says granted locks kept in order of grmode */
+		lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
+				lkb->lkb_grmode);
+		break;
+	case DLM_LKSTS_CONVERT:
+		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
+			list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
+		else
+			list_add_tail(&lkb->lkb_statequeue,
+				      &r->res_convertqueue);
+		break;
+	default:
+		DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
+	}
+}
+
+static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	lkb->lkb_status = 0;
+	list_del(&lkb->lkb_statequeue);
+	unhold_lkb(lkb);
+}
+
+static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
+{
+	hold_lkb(lkb);
+	del_lkb(r, lkb);
+	add_lkb(r, lkb, sts);
+	unhold_lkb(lkb);
+}
+
+/* add/remove lkb from global waiters list of lkb's waiting for
+   a reply from a remote node */
+
+static void add_to_waiters(struct dlm_lkb *lkb, int mstype)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+	if (lkb->lkb_wait_type) {
+		log_print("add_to_waiters error %d", lkb->lkb_wait_type);
+		goto out;
+	}
+	lkb->lkb_wait_type = mstype;
+	kref_get(&lkb->lkb_ref);
+	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
+ out:
+	mutex_unlock(&ls->ls_waiters_mutex);
+}
+
+static int _remove_from_waiters(struct dlm_lkb *lkb)
+{
+	int error = 0;
+
+	if (!lkb->lkb_wait_type) {
+		log_print("remove_from_waiters error");
+		error = -EINVAL;
+		goto out;
+	}
+	lkb->lkb_wait_type = 0;
+	list_del(&lkb->lkb_wait_reply);
+	unhold_lkb(lkb);
+ out:
+	return error;
+}
+
+static int remove_from_waiters(struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	int error;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+	error = _remove_from_waiters(lkb);
+	mutex_unlock(&ls->ls_waiters_mutex);
+	return error;
+}
+
+static void dir_remove(struct dlm_rsb *r)
+{
+	int to_nodeid;
+
+	if (dlm_no_directory(r->res_ls))
+		return;
+
+	to_nodeid = dlm_dir_nodeid(r);
+	if (to_nodeid != dlm_our_nodeid())
+		send_remove(r);
+	else
+		dlm_dir_remove_entry(r->res_ls, to_nodeid,
+				     r->res_name, r->res_length);
+}
+
+/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is
+   found since they are in order of newest to oldest? */
+
+static int shrink_bucket(struct dlm_ls *ls, int b)
+{
+	struct dlm_rsb *r;
+	int count = 0, found;
+
+	for (;;) {
+		found = 0;
+		write_lock(&ls->ls_rsbtbl[b].lock);
+		list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
+					    res_hashchain) {
+			if (!time_after_eq(jiffies, r->res_toss_time +
+					   dlm_config.toss_secs * HZ))
+				continue;
+			found = 1;
+			break;
+		}
+
+		if (!found) {
+			write_unlock(&ls->ls_rsbtbl[b].lock);
+			break;
+		}
+
+		if (kref_put(&r->res_ref, kill_rsb)) {
+			list_del(&r->res_hashchain);
+			write_unlock(&ls->ls_rsbtbl[b].lock);
+
+			if (is_master(r))
+				dir_remove(r);
+			free_rsb(r);
+			count++;
+		} else {
+			write_unlock(&ls->ls_rsbtbl[b].lock);
+			log_error(ls, "tossed rsb in use %s", r->res_name);
+		}
+	}
+
+	return count;
+}
+
+void dlm_scan_rsbs(struct dlm_ls *ls)
+{
+	int i;
+
+	if (dlm_locking_stopped(ls))
+		return;
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		shrink_bucket(ls, i);
+		cond_resched();
+	}
+}
+
+/* lkb is master or local copy */
+
+static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int b, len = r->res_ls->ls_lvblen;
+
+	/* b=1 lvb returned to caller
+	   b=0 lvb written to rsb or invalidated
+	   b=-1 do nothing */
+
+	b =  dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+
+	if (b == 1) {
+		if (!lkb->lkb_lvbptr)
+			return;
+
+		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			return;
+
+		if (!r->res_lvbptr)
+			return;
+
+		memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
+		lkb->lkb_lvbseq = r->res_lvbseq;
+
+	} else if (b == 0) {
+		if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+			rsb_set_flag(r, RSB_VALNOTVALID);
+			return;
+		}
+
+		if (!lkb->lkb_lvbptr)
+			return;
+
+		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			return;
+
+		if (!r->res_lvbptr)
+			r->res_lvbptr = allocate_lvb(r->res_ls);
+
+		if (!r->res_lvbptr)
+			return;
+
+		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
+		r->res_lvbseq++;
+		lkb->lkb_lvbseq = r->res_lvbseq;
+		rsb_clear_flag(r, RSB_VALNOTVALID);
+	}
+
+	if (rsb_flag(r, RSB_VALNOTVALID))
+		lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
+}
+
+static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_grmode < DLM_LOCK_PW)
+		return;
+
+	if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
+		rsb_set_flag(r, RSB_VALNOTVALID);
+		return;
+	}
+
+	if (!lkb->lkb_lvbptr)
+		return;
+
+	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return;
+
+	if (!r->res_lvbptr)
+		r->res_lvbptr = allocate_lvb(r->res_ls);
+
+	if (!r->res_lvbptr)
+		return;
+
+	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+	r->res_lvbseq++;
+	rsb_clear_flag(r, RSB_VALNOTVALID);
+}
+
+/* lkb is process copy (pc) */
+
+static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			    struct dlm_message *ms)
+{
+	int b;
+
+	if (!lkb->lkb_lvbptr)
+		return;
+
+	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+		return;
+
+	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
+	if (b == 1) {
+		int len = receive_extralen(ms);
+		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+		lkb->lkb_lvbseq = ms->m_lvbseq;
+	}
+}
+
+/* Manipulate lkb's on rsb's convert/granted/waiting queues
+   remove_lock -- used for unlock, removes lkb from granted
+   revert_lock -- used for cancel, moves lkb from convert to granted
+   grant_lock  -- used for request and convert, adds lkb to granted or
+                  moves lkb from convert or waiting to granted
+
+   Each of these is used for master or local copy lkb's.  There is
+   also a _pc() variation used to make the corresponding change on
+   a process copy (pc) lkb. */
+
+static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	del_lkb(r, lkb);
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	/* this unhold undoes the original ref from create_lkb()
+	   so this leads to the lkb being freed */
+	unhold_lkb(lkb);
+}
+
+static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	set_lvb_unlock(r, lkb);
+	_remove_lock(r, lkb);
+}
+
+static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	_remove_lock(r, lkb);
+}
+
+static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	lkb->lkb_rqmode = DLM_LOCK_IV;
+
+	switch (lkb->lkb_status) {
+	case DLM_LKSTS_GRANTED:
+		break;
+	case DLM_LKSTS_CONVERT:
+		move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+		break;
+	case DLM_LKSTS_WAITING:
+		del_lkb(r, lkb);
+		lkb->lkb_grmode = DLM_LOCK_IV;
+		/* this unhold undoes the original ref from create_lkb()
+		   so this leads to the lkb being freed */
+		unhold_lkb(lkb);
+		break;
+	default:
+		log_print("invalid status for revert %d", lkb->lkb_status);
+	}
+}
+
+static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	revert_lock(r, lkb);
+}
+
+static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_grmode != lkb->lkb_rqmode) {
+		lkb->lkb_grmode = lkb->lkb_rqmode;
+		if (lkb->lkb_status)
+			move_lkb(r, lkb, DLM_LKSTS_GRANTED);
+		else
+			add_lkb(r, lkb, DLM_LKSTS_GRANTED);
+	}
+
+	lkb->lkb_rqmode = DLM_LOCK_IV;
+}
+
+static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	set_lvb_lock(r, lkb);
+	_grant_lock(r, lkb);
+	lkb->lkb_highbast = 0;
+}
+
+static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			  struct dlm_message *ms)
+{
+	set_lvb_lock_pc(r, lkb, ms);
+	_grant_lock(r, lkb);
+}
+
+/* called by grant_pending_locks() which means an async grant message must
+   be sent to the requesting node in addition to granting the lock if the
+   lkb belongs to a remote node. */
+
+static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	grant_lock(r, lkb);
+	if (is_master_copy(lkb))
+		send_grant(r, lkb);
+	else
+		queue_cast(r, lkb, 0);
+}
+
+static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
+{
+	struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
+					   lkb_statequeue);
+	if (lkb->lkb_id == first->lkb_id)
+		return 1;
+
+	return 0;
+}
+
+/* Check if the given lkb conflicts with another lkb on the queue. */
+
+static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *this;
+
+	list_for_each_entry(this, head, lkb_statequeue) {
+		if (this == lkb)
+			continue;
+		if (!modes_compat(this, lkb))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * "A conversion deadlock arises with a pair of lock requests in the converting
+ * queue for one resource.  The granted mode of each lock blocks the requested
+ * mode of the other lock."
+ *
+ * Part 2: if the granted mode of lkb is preventing the first lkb in the
+ * convert queue from being granted, then demote lkb (set grmode to NL).
+ * This second form requires that we check for conv-deadlk even when
+ * now == 0 in _can_be_granted().
+ *
+ * Example:
+ * Granted Queue: empty
+ * Convert Queue: NL->EX (first lock)
+ *                PR->EX (second lock)
+ *
+ * The first lock can't be granted because of the granted mode of the second
+ * lock and the second lock can't be granted because it's not first in the
+ * list.  We demote the granted mode of the second lock (the lkb passed to this
+ * function).
+ *
+ * After the resolution, the "grant pending" function needs to go back and try
+ * to grant locks on the convert queue again since the first lock can now be
+ * granted.
+ */
+
+static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *this, *first = NULL, *self = NULL;
+
+	list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) {
+		if (!first)
+			first = this;
+		if (this == lkb) {
+			self = lkb;
+			continue;
+		}
+
+		if (!modes_compat(this, lkb) && !modes_compat(lkb, this))
+			return 1;
+	}
+
+	/* if lkb is on the convert queue and is preventing the first
+	   from being granted, then there's deadlock and we demote lkb.
+	   multiple converting locks may need to do this before the first
+	   converting lock can be granted. */
+
+	if (self && self != first) {
+		if (!modes_compat(lkb, first) &&
+		    !queue_conflict(&rsb->res_grantqueue, first))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Return 1 if the lock can be granted, 0 otherwise.
+ * Also detect and resolve conversion deadlocks.
+ *
+ * lkb is the lock to be granted
+ *
+ * now is 1 if the function is being called in the context of the
+ * immediate request, it is 0 if called later, after the lock has been
+ * queued.
+ *
+ * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
+ */
+
+static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+	int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
+
+	/*
+	 * 6-10: Version 5.4 introduced an option to address the phenomenon of
+	 * a new request for a NL mode lock being blocked.
+	 *
+	 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
+	 * request, then it would be granted.  In essence, the use of this flag
+	 * tells the Lock Manager to expedite theis request by not considering
+	 * what may be in the CONVERTING or WAITING queues...  As of this
+	 * writing, the EXPEDITE flag can be used only with new requests for NL
+	 * mode locks.  This flag is not valid for conversion requests.
+	 *
+	 * A shortcut.  Earlier checks return an error if EXPEDITE is used in a
+	 * conversion or used with a non-NL requested mode.  We also know an
+	 * EXPEDITE request is always granted immediately, so now must always
+	 * be 1.  The full condition to grant an expedite request: (now &&
+	 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
+	 * therefore be shortened to just checking the flag.
+	 */
+
+	if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
+		return 1;
+
+	/*
+	 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
+	 * added to the remaining conditions.
+	 */
+
+	if (queue_conflict(&r->res_grantqueue, lkb))
+		goto out;
+
+	/*
+	 * 6-3: By default, a conversion request is immediately granted if the
+	 * requested mode is compatible with the modes of all other granted
+	 * locks
+	 */
+
+	if (queue_conflict(&r->res_convertqueue, lkb))
+		goto out;
+
+	/*
+	 * 6-5: But the default algorithm for deciding whether to grant or
+	 * queue conversion requests does not by itself guarantee that such
+	 * requests are serviced on a "first come first serve" basis.  This, in
+	 * turn, can lead to a phenomenon known as "indefinate postponement".
+	 *
+	 * 6-7: This issue is dealt with by using the optional QUECVT flag with
+	 * the system service employed to request a lock conversion.  This flag
+	 * forces certain conversion requests to be queued, even if they are
+	 * compatible with the granted modes of other locks on the same
+	 * resource.  Thus, the use of this flag results in conversion requests
+	 * being ordered on a "first come first servce" basis.
+	 *
+	 * DCT: This condition is all about new conversions being able to occur
+	 * "in place" while the lock remains on the granted queue (assuming
+	 * nothing else conflicts.)  IOW if QUECVT isn't set, a conversion
+	 * doesn't _have_ to go onto the convert queue where it's processed in
+	 * order.  The "now" variable is necessary to distinguish converts
+	 * being received and processed for the first time now, because once a
+	 * convert is moved to the conversion queue the condition below applies
+	 * requiring fifo granting.
+	 */
+
+	if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
+		return 1;
+
+	/*
+	 * The NOORDER flag is set to avoid the standard vms rules on grant
+	 * order.
+	 */
+
+	if (lkb->lkb_exflags & DLM_LKF_NOORDER)
+		return 1;
+
+	/*
+	 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
+	 * granted until all other conversion requests ahead of it are granted
+	 * and/or canceled.
+	 */
+
+	if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
+		return 1;
+
+	/*
+	 * 6-4: By default, a new request is immediately granted only if all
+	 * three of the following conditions are satisfied when the request is
+	 * issued:
+	 * - The queue of ungranted conversion requests for the resource is
+	 *   empty.
+	 * - The queue of ungranted new requests for the resource is empty.
+	 * - The mode of the new request is compatible with the most
+	 *   restrictive mode of all granted locks on the resource.
+	 */
+
+	if (now && !conv && list_empty(&r->res_convertqueue) &&
+	    list_empty(&r->res_waitqueue))
+		return 1;
+
+	/*
+	 * 6-4: Once a lock request is in the queue of ungranted new requests,
+	 * it cannot be granted until the queue of ungranted conversion
+	 * requests is empty, all ungranted new requests ahead of it are
+	 * granted and/or canceled, and it is compatible with the granted mode
+	 * of the most restrictive lock granted on the resource.
+	 */
+
+	if (!now && !conv && list_empty(&r->res_convertqueue) &&
+	    first_in_list(lkb, &r->res_waitqueue))
+		return 1;
+
+ out:
+	/*
+	 * The following, enabled by CONVDEADLK, departs from VMS.
+	 */
+
+	if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) &&
+	    conversion_deadlock_detect(r, lkb)) {
+		lkb->lkb_grmode = DLM_LOCK_NL;
+		lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
+	}
+
+	return 0;
+}
+
+/*
+ * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a
+ * simple way to provide a big optimization to applications that can use them.
+ */
+
+static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
+{
+	uint32_t flags = lkb->lkb_exflags;
+	int rv;
+	int8_t alt = 0, rqmode = lkb->lkb_rqmode;
+
+	rv = _can_be_granted(r, lkb, now);
+	if (rv)
+		goto out;
+
+	if (lkb->lkb_sbflags & DLM_SBF_DEMOTED)
+		goto out;
+
+	if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR)
+		alt = DLM_LOCK_PR;
+	else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW)
+		alt = DLM_LOCK_CW;
+
+	if (alt) {
+		lkb->lkb_rqmode = alt;
+		rv = _can_be_granted(r, lkb, now);
+		if (rv)
+			lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
+		else
+			lkb->lkb_rqmode = rqmode;
+	}
+ out:
+	return rv;
+}
+
+static int grant_pending_convert(struct dlm_rsb *r, int high)
+{
+	struct dlm_lkb *lkb, *s;
+	int hi, demoted, quit, grant_restart, demote_restart;
+
+	quit = 0;
+ restart:
+	grant_restart = 0;
+	demote_restart = 0;
+	hi = DLM_LOCK_IV;
+
+	list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
+		demoted = is_demoted(lkb);
+		if (can_be_granted(r, lkb, 0)) {
+			grant_lock_pending(r, lkb);
+			grant_restart = 1;
+		} else {
+			hi = max_t(int, lkb->lkb_rqmode, hi);
+			if (!demoted && is_demoted(lkb))
+				demote_restart = 1;
+		}
+	}
+
+	if (grant_restart)
+		goto restart;
+	if (demote_restart && !quit) {
+		quit = 1;
+		goto restart;
+	}
+
+	return max_t(int, high, hi);
+}
+
+static int grant_pending_wait(struct dlm_rsb *r, int high)
+{
+	struct dlm_lkb *lkb, *s;
+
+	list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
+		if (can_be_granted(r, lkb, 0))
+			grant_lock_pending(r, lkb);
+                else
+			high = max_t(int, lkb->lkb_rqmode, high);
+	}
+
+	return high;
+}
+
+static void grant_pending_locks(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb, *s;
+	int high = DLM_LOCK_IV;
+
+	DLM_ASSERT(is_master(r), dlm_dump_rsb(r););
+
+	high = grant_pending_convert(r, high);
+	high = grant_pending_wait(r, high);
+
+	if (high == DLM_LOCK_IV)
+		return;
+
+	/*
+	 * If there are locks left on the wait/convert queue then send blocking
+	 * ASTs to granted locks based on the largest requested mode (high)
+	 * found above. FIXME: highbast < high comparison not valid for PR/CW.
+	 */
+
+	list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
+		if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) &&
+		    !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) {
+			queue_bast(r, lkb, high);
+			lkb->lkb_highbast = high;
+		}
+	}
+}
+
+static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
+			    struct dlm_lkb *lkb)
+{
+	struct dlm_lkb *gr;
+
+	list_for_each_entry(gr, head, lkb_statequeue) {
+		if (gr->lkb_bastaddr &&
+		    gr->lkb_highbast < lkb->lkb_rqmode &&
+		    !modes_compat(gr, lkb)) {
+			queue_bast(r, gr, lkb->lkb_rqmode);
+			gr->lkb_highbast = lkb->lkb_rqmode;
+		}
+	}
+}
+
+static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	send_bast_queue(r, &r->res_grantqueue, lkb);
+}
+
+static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	send_bast_queue(r, &r->res_grantqueue, lkb);
+	send_bast_queue(r, &r->res_convertqueue, lkb);
+}
+
+/* set_master(r, lkb) -- set the master nodeid of a resource
+
+   The purpose of this function is to set the nodeid field in the given
+   lkb using the nodeid field in the given rsb.  If the rsb's nodeid is
+   known, it can just be copied to the lkb and the function will return
+   0.  If the rsb's nodeid is _not_ known, it needs to be looked up
+   before it can be copied to the lkb.
+
+   When the rsb nodeid is being looked up remotely, the initial lkb
+   causing the lookup is kept on the ls_waiters list waiting for the
+   lookup reply.  Other lkb's waiting for the same rsb lookup are kept
+   on the rsb's res_lookup list until the master is verified.
+
+   Return values:
+   0: nodeid is set in rsb/lkb and the caller should go ahead and use it
+   1: the rsb master is not available and the lkb has been placed on
+      a wait queue
+*/
+
+static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls = r->res_ls;
+	int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+
+	if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
+		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
+		r->res_first_lkid = lkb->lkb_id;
+		lkb->lkb_nodeid = r->res_nodeid;
+		return 0;
+	}
+
+	if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
+		list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
+		return 1;
+	}
+
+	if (r->res_nodeid == 0) {
+		lkb->lkb_nodeid = 0;
+		return 0;
+	}
+
+	if (r->res_nodeid > 0) {
+		lkb->lkb_nodeid = r->res_nodeid;
+		return 0;
+	}
+
+	DLM_ASSERT(r->res_nodeid == -1, dlm_dump_rsb(r););
+
+	dir_nodeid = dlm_dir_nodeid(r);
+
+	if (dir_nodeid != our_nodeid) {
+		r->res_first_lkid = lkb->lkb_id;
+		send_lookup(r, lkb);
+		return 1;
+	}
+
+	for (;;) {
+		/* It's possible for dlm_scand to remove an old rsb for
+		   this same resource from the toss list, us to create
+		   a new one, look up the master locally, and find it
+		   already exists just before dlm_scand does the
+		   dir_remove() on the previous rsb. */
+
+		error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+				       r->res_length, &ret_nodeid);
+		if (!error)
+			break;
+		log_debug(ls, "dir_lookup error %d %s", error, r->res_name);
+		schedule();
+	}
+
+	if (ret_nodeid == our_nodeid) {
+		r->res_first_lkid = 0;
+		r->res_nodeid = 0;
+		lkb->lkb_nodeid = 0;
+	} else {
+		r->res_first_lkid = lkb->lkb_id;
+		r->res_nodeid = ret_nodeid;
+		lkb->lkb_nodeid = ret_nodeid;
+	}
+	return 0;
+}
+
+static void process_lookup_list(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
+		list_del(&lkb->lkb_rsb_lookup);
+		_request_lock(r, lkb);
+		schedule();
+	}
+}
+
+/* confirm_master -- confirm (or deny) an rsb's master nodeid */
+
+static void confirm_master(struct dlm_rsb *r, int error)
+{
+	struct dlm_lkb *lkb;
+
+	if (!r->res_first_lkid)
+		return;
+
+	switch (error) {
+	case 0:
+	case -EINPROGRESS:
+		r->res_first_lkid = 0;
+		process_lookup_list(r);
+		break;
+
+	case -EAGAIN:
+		/* the remote master didn't queue our NOQUEUE request;
+		   make a waiting lkb the first_lkid */
+
+		r->res_first_lkid = 0;
+
+		if (!list_empty(&r->res_lookup)) {
+			lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
+					 lkb_rsb_lookup);
+			list_del(&lkb->lkb_rsb_lookup);
+			r->res_first_lkid = lkb->lkb_id;
+			_request_lock(r, lkb);
+		} else
+			r->res_nodeid = -1;
+		break;
+
+	default:
+		log_error(r->res_ls, "confirm_master unknown error %d", error);
+	}
+}
+
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+			 int namelen, uint32_t parent_lkid, void *ast,
+			 void *astarg, void *bast, struct dlm_args *args)
+{
+	int rv = -EINVAL;
+
+	/* check for invalid arg usage */
+
+	if (mode < 0 || mode > DLM_LOCK_EX)
+		goto out;
+
+	if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
+		goto out;
+
+	if (flags & DLM_LKF_CANCEL)
+		goto out;
+
+	if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
+		goto out;
+
+	if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
+		goto out;
+
+	if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
+		goto out;
+
+	if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
+		goto out;
+
+	if (!ast || !lksb)
+		goto out;
+
+	if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
+		goto out;
+
+	/* parent/child locks not yet supported */
+	if (parent_lkid)
+		goto out;
+
+	if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
+		goto out;
+
+	/* these args will be copied to the lkb in validate_lock_args,
+	   it cannot be done now because when converting locks, fields in
+	   an active lkb cannot be modified before locking the rsb */
+
+	args->flags = flags;
+	args->astaddr = ast;
+	args->astparam = (long) astarg;
+	args->bastaddr = bast;
+	args->mode = mode;
+	args->lksb = lksb;
+	rv = 0;
+ out:
+	return rv;
+}
+
+static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
+{
+	if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
+ 		      DLM_LKF_FORCEUNLOCK))
+		return -EINVAL;
+
+	args->flags = flags;
+	args->astparam = (long) astarg;
+	return 0;
+}
+
+static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			      struct dlm_args *args)
+{
+	int rv = -EINVAL;
+
+	if (args->flags & DLM_LKF_CONVERT) {
+		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+			goto out;
+
+		if (args->flags & DLM_LKF_QUECVT &&
+		    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+			goto out;
+
+		rv = -EBUSY;
+		if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+			goto out;
+
+		if (lkb->lkb_wait_type)
+			goto out;
+	}
+
+	lkb->lkb_exflags = args->flags;
+	lkb->lkb_sbflags = 0;
+	lkb->lkb_astaddr = args->astaddr;
+	lkb->lkb_astparam = args->astparam;
+	lkb->lkb_bastaddr = args->bastaddr;
+	lkb->lkb_rqmode = args->mode;
+	lkb->lkb_lksb = args->lksb;
+	lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
+	lkb->lkb_ownpid = (int) current->pid;
+	rv = 0;
+ out:
+	return rv;
+}
+
+static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
+{
+	int rv = -EINVAL;
+
+	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+		goto out;
+
+	if (args->flags & DLM_LKF_FORCEUNLOCK)
+		goto out_ok;
+
+	if (args->flags & DLM_LKF_CANCEL &&
+	    lkb->lkb_status == DLM_LKSTS_GRANTED)
+		goto out;
+
+	if (!(args->flags & DLM_LKF_CANCEL) &&
+	    lkb->lkb_status != DLM_LKSTS_GRANTED)
+		goto out;
+
+	rv = -EBUSY;
+	if (lkb->lkb_wait_type)
+		goto out;
+
+ out_ok:
+	lkb->lkb_exflags = args->flags;
+	lkb->lkb_sbflags = 0;
+	lkb->lkb_astparam = args->astparam;
+
+	rv = 0;
+ out:
+	return rv;
+}
+
+/*
+ * Four stage 4 varieties:
+ * do_request(), do_convert(), do_unlock(), do_cancel()
+ * These are called on the master node for the given lock and
+ * from the central locking logic.
+ */
+
+static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error = 0;
+
+	if (can_be_granted(r, lkb, 1)) {
+		grant_lock(r, lkb);
+		queue_cast(r, lkb, 0);
+		goto out;
+	}
+
+	if (can_be_queued(lkb)) {
+		error = -EINPROGRESS;
+		add_lkb(r, lkb, DLM_LKSTS_WAITING);
+		send_blocking_asts(r, lkb);
+		goto out;
+	}
+
+	error = -EAGAIN;
+	if (force_blocking_asts(lkb))
+		send_blocking_asts_all(r, lkb);
+	queue_cast(r, lkb, -EAGAIN);
+
+ out:
+	return error;
+}
+
+static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error = 0;
+
+	/* changing an existing lock may allow others to be granted */
+
+	if (can_be_granted(r, lkb, 1)) {
+		grant_lock(r, lkb);
+		queue_cast(r, lkb, 0);
+		grant_pending_locks(r);
+		goto out;
+	}
+
+	if (can_be_queued(lkb)) {
+		if (is_demoted(lkb))
+			grant_pending_locks(r);
+		error = -EINPROGRESS;
+		del_lkb(r, lkb);
+		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+		send_blocking_asts(r, lkb);
+		goto out;
+	}
+
+	error = -EAGAIN;
+	if (force_blocking_asts(lkb))
+		send_blocking_asts_all(r, lkb);
+	queue_cast(r, lkb, -EAGAIN);
+
+ out:
+	return error;
+}
+
+static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	remove_lock(r, lkb);
+	queue_cast(r, lkb, -DLM_EUNLOCK);
+	grant_pending_locks(r);
+	return -DLM_EUNLOCK;
+}
+
+/* FIXME: if revert_lock() finds that the lkb is granted, we should
+   skip the queue_cast(ECANCEL).  It indicates that the request/convert
+   completed (and queued a normal ast) just before the cancel; we don't
+   want to clobber the sb_result for the normal ast with ECANCEL. */
+ 
+static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	revert_lock(r, lkb);
+	queue_cast(r, lkb, -DLM_ECANCEL);
+	grant_pending_locks(r);
+	return -DLM_ECANCEL;
+}
+
+/*
+ * Four stage 3 varieties:
+ * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
+ */
+
+/* add a new lkb to a possibly new rsb, called by requesting process */
+
+static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	/* set_master: sets lkb nodeid from r */
+
+	error = set_master(r, lkb);
+	if (error < 0)
+		goto out;
+	if (error) {
+		error = 0;
+		goto out;
+	}
+
+	if (is_remote(r))
+		/* receive_request() calls do_request() on remote node */
+		error = send_request(r, lkb);
+	else
+		error = do_request(r, lkb);
+ out:
+	return error;
+}
+
+/* change some property of an existing lkb, e.g. mode */
+
+static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	if (is_remote(r))
+		/* receive_convert() calls do_convert() on remote node */
+		error = send_convert(r, lkb);
+	else
+		error = do_convert(r, lkb);
+
+	return error;
+}
+
+/* remove an existing lkb from the granted queue */
+
+static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	if (is_remote(r))
+		/* receive_unlock() calls do_unlock() on remote node */
+		error = send_unlock(r, lkb);
+	else
+		error = do_unlock(r, lkb);
+
+	return error;
+}
+
+/* remove an existing lkb from the convert or wait queue */
+
+static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	if (is_remote(r))
+		/* receive_cancel() calls do_cancel() on remote node */
+		error = send_cancel(r, lkb);
+	else
+		error = do_cancel(r, lkb);
+
+	return error;
+}
+
+/*
+ * Four stage 2 varieties:
+ * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
+ */
+
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
+			int len, struct dlm_args *args)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	error = validate_lock_args(ls, lkb, args);
+	if (error)
+		goto out;
+
+	error = find_rsb(ls, name, len, R_CREATE, &r);
+	if (error)
+		goto out;
+
+	lock_rsb(r);
+
+	attach_lkb(r, lkb);
+	lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
+
+	error = _request_lock(r, lkb);
+
+	unlock_rsb(r);
+	put_rsb(r);
+
+ out:
+	return error;
+}
+
+static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			struct dlm_args *args)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_lock_args(ls, lkb, args);
+	if (error)
+		goto out;
+
+	error = _convert_lock(r, lkb);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_args *args)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_unlock_args(lkb, args);
+	if (error)
+		goto out;
+
+	error = _unlock_lock(r, lkb);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_args *args)
+{
+	struct dlm_rsb *r;
+	int error;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = validate_unlock_args(lkb, args);
+	if (error)
+		goto out;
+
+	error = _cancel_lock(r, lkb);
+ out:
+	unlock_rsb(r);
+	put_rsb(r);
+	return error;
+}
+
+/*
+ * Two stage 1 varieties:  dlm_lock() and dlm_unlock()
+ */
+
+int dlm_lock(dlm_lockspace_t *lockspace,
+	     int mode,
+	     struct dlm_lksb *lksb,
+	     uint32_t flags,
+	     void *name,
+	     unsigned int namelen,
+	     uint32_t parent_lkid,
+	     void (*ast) (void *astarg),
+	     void *astarg,
+	     void (*bast) (void *astarg, int mode))
+{
+	struct dlm_ls *ls;
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	int error, convert = flags & DLM_LKF_CONVERT;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	lock_recovery(ls);
+
+	if (convert)
+		error = find_lkb(ls, lksb->sb_lkid, &lkb);
+	else
+		error = create_lkb(ls, &lkb);
+
+	if (error)
+		goto out;
+
+	error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
+			      astarg, bast, &args);
+	if (error)
+		goto out_put;
+
+	if (convert)
+		error = convert_lock(ls, lkb, &args);
+	else
+		error = request_lock(ls, lkb, name, namelen, &args);
+
+	if (error == -EINPROGRESS)
+		error = 0;
+ out_put:
+	if (convert || error)
+		__put_lkb(ls, lkb);
+	if (error == -EAGAIN)
+		error = 0;
+ out:
+	unlock_recovery(ls);
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+int dlm_unlock(dlm_lockspace_t *lockspace,
+	       uint32_t lkid,
+	       uint32_t flags,
+	       struct dlm_lksb *lksb,
+	       void *astarg)
+{
+	struct dlm_ls *ls;
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	int error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+
+	lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	error = set_unlock_args(flags, astarg, &args);
+	if (error)
+		goto out_put;
+
+	if (flags & DLM_LKF_CANCEL)
+		error = cancel_lock(ls, lkb, &args);
+	else
+		error = unlock_lock(ls, lkb, &args);
+
+	if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
+		error = 0;
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	unlock_recovery(ls);
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+/*
+ * send/receive routines for remote operations and replies
+ *
+ * send_args
+ * send_common
+ * send_request			receive_request
+ * send_convert			receive_convert
+ * send_unlock			receive_unlock
+ * send_cancel			receive_cancel
+ * send_grant			receive_grant
+ * send_bast			receive_bast
+ * send_lookup			receive_lookup
+ * send_remove			receive_remove
+ *
+ * 				send_common_reply
+ * receive_request_reply	send_request_reply
+ * receive_convert_reply	send_convert_reply
+ * receive_unlock_reply		send_unlock_reply
+ * receive_cancel_reply		send_cancel_reply
+ * receive_lookup_reply		send_lookup_reply
+ */
+
+static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			  int to_nodeid, int mstype,
+			  struct dlm_message **ms_ret,
+			  struct dlm_mhandle **mh_ret)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	char *mb;
+	int mb_len = sizeof(struct dlm_message);
+
+	switch (mstype) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+	case DLM_MSG_REMOVE:
+		mb_len += r->res_length;
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (lkb && lkb->lkb_lvbptr)
+			mb_len += r->res_ls->ls_lvblen;
+		break;
+	}
+
+	/* get_buffer gives us a message handle (mh) that we need to
+	   pass into lowcomms_commit and a message buffer (mb) that we
+	   write our data into */
+
+	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
+	if (!mh)
+		return -ENOBUFS;
+
+	memset(mb, 0, mb_len);
+
+	ms = (struct dlm_message *) mb;
+
+	ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	ms->m_header.h_lockspace = r->res_ls->ls_global_id;
+	ms->m_header.h_nodeid = dlm_our_nodeid();
+	ms->m_header.h_length = mb_len;
+	ms->m_header.h_cmd = DLM_MSG;
+
+	ms->m_type = mstype;
+
+	*mh_ret = mh;
+	*ms_ret = ms;
+	return 0;
+}
+
+/* further lowcomms enhancements or alternate implementations may make
+   the return value from this function useful at some point */
+
+static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
+{
+	dlm_message_out(ms);
+	dlm_lowcomms_commit_buffer(mh);
+	return 0;
+}
+
+static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
+		      struct dlm_message *ms)
+{
+	ms->m_nodeid   = lkb->lkb_nodeid;
+	ms->m_pid      = lkb->lkb_ownpid;
+	ms->m_lkid     = lkb->lkb_id;
+	ms->m_remid    = lkb->lkb_remid;
+	ms->m_exflags  = lkb->lkb_exflags;
+	ms->m_sbflags  = lkb->lkb_sbflags;
+	ms->m_flags    = lkb->lkb_flags;
+	ms->m_lvbseq   = lkb->lkb_lvbseq;
+	ms->m_status   = lkb->lkb_status;
+	ms->m_grmode   = lkb->lkb_grmode;
+	ms->m_rqmode   = lkb->lkb_rqmode;
+	ms->m_hash     = r->res_hash;
+
+	/* m_result and m_bastmode are set from function args,
+	   not from lkb fields */
+
+	if (lkb->lkb_bastaddr)
+		ms->m_asts |= AST_BAST;
+	if (lkb->lkb_astaddr)
+		ms->m_asts |= AST_COMP;
+
+	if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
+		memcpy(ms->m_extra, r->res_name, r->res_length);
+
+	else if (lkb->lkb_lvbptr)
+		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+
+}
+
+static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	add_to_waiters(lkb, mstype);
+
+	to_nodeid = r->res_nodeid;
+
+	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+	if (error)
+		goto fail;
+
+	send_args(r, lkb, ms);
+
+	error = send_message(mh, ms);
+	if (error)
+		goto fail;
+	return 0;
+
+ fail:
+	remove_from_waiters(lkb);
+	return error;
+}
+
+static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	return send_common(r, lkb, DLM_MSG_REQUEST);
+}
+
+static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	int error;
+
+	error = send_common(r, lkb, DLM_MSG_CONVERT);
+
+	/* down conversions go without a reply from the master */
+	if (!error && down_conversion(lkb)) {
+		remove_from_waiters(lkb);
+		r->res_ls->ls_stub_ms.m_result = 0;
+		r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
+		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
+	}
+
+	return error;
+}
+
+/* FIXME: if this lkb is the only lock we hold on the rsb, then set
+   MASTER_UNCERTAIN to force the next request on the rsb to confirm
+   that the master is still correct. */
+
+static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	return send_common(r, lkb, DLM_MSG_UNLOCK);
+}
+
+static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	return send_common(r, lkb, DLM_MSG_CANCEL);
+}
+
+static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = lkb->lkb_nodeid;
+
+	error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
+	if (error)
+		goto out;
+
+	send_args(r, lkb, ms);
+
+	ms->m_result = 0;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = lkb->lkb_nodeid;
+
+	error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
+	if (error)
+		goto out;
+
+	send_args(r, lkb, ms);
+
+	ms->m_bastmode = mode;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	add_to_waiters(lkb, DLM_MSG_LOOKUP);
+
+	to_nodeid = dlm_dir_nodeid(r);
+
+	error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
+	if (error)
+		goto fail;
+
+	send_args(r, lkb, ms);
+
+	error = send_message(mh, ms);
+	if (error)
+		goto fail;
+	return 0;
+
+ fail:
+	remove_from_waiters(lkb);
+	return error;
+}
+
+static int send_remove(struct dlm_rsb *r)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = dlm_dir_nodeid(r);
+
+	error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
+	if (error)
+		goto out;
+
+	memcpy(ms->m_extra, r->res_name, r->res_length);
+	ms->m_hash = r->res_hash;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			     int mstype, int rv)
+{
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int to_nodeid, error;
+
+	to_nodeid = lkb->lkb_nodeid;
+
+	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
+	if (error)
+		goto out;
+
+	send_args(r, lkb, ms);
+
+	ms->m_result = rv;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
+}
+
+static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
+}
+
+static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
+}
+
+static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
+{
+	return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
+}
+
+static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
+			     int ret_nodeid, int rv)
+{
+	struct dlm_rsb *r = &ls->ls_stub_rsb;
+	struct dlm_message *ms;
+	struct dlm_mhandle *mh;
+	int error, nodeid = ms_in->m_header.h_nodeid;
+
+	error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
+	if (error)
+		goto out;
+
+	ms->m_lkid = ms_in->m_lkid;
+	ms->m_result = rv;
+	ms->m_nodeid = ret_nodeid;
+
+	error = send_message(mh, ms);
+ out:
+	return error;
+}
+
+/* which args we save from a received message depends heavily on the type
+   of message, unlike the send side where we can safely send everything about
+   the lkb for any type of message */
+
+static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	lkb->lkb_exflags = ms->m_exflags;
+	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+		         (ms->m_flags & 0x0000FFFF);
+}
+
+static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	lkb->lkb_sbflags = ms->m_sbflags;
+	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
+		         (ms->m_flags & 0x0000FFFF);
+}
+
+static int receive_extralen(struct dlm_message *ms)
+{
+	return (ms->m_header.h_length - sizeof(struct dlm_message));
+}
+
+static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
+		       struct dlm_message *ms)
+{
+	int len;
+
+	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+		if (!lkb->lkb_lvbptr)
+			lkb->lkb_lvbptr = allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+		len = receive_extralen(ms);
+		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
+	}
+	return 0;
+}
+
+static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				struct dlm_message *ms)
+{
+	lkb->lkb_nodeid = ms->m_header.h_nodeid;
+	lkb->lkb_ownpid = ms->m_pid;
+	lkb->lkb_remid = ms->m_lkid;
+	lkb->lkb_grmode = DLM_LOCK_IV;
+	lkb->lkb_rqmode = ms->m_rqmode;
+	lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
+	lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);
+
+	DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
+
+	if (receive_lvb(ls, lkb, ms))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				struct dlm_message *ms)
+{
+	if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
+		log_error(ls, "convert_args nodeid %d %d lkid %x %x",
+			  lkb->lkb_nodeid, ms->m_header.h_nodeid,
+			  lkb->lkb_id, lkb->lkb_remid);
+		return -EINVAL;
+	}
+
+	if (!is_master_copy(lkb))
+		return -EINVAL;
+
+	if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+		return -EBUSY;
+
+	if (receive_lvb(ls, lkb, ms))
+		return -ENOMEM;
+
+	lkb->lkb_rqmode = ms->m_rqmode;
+	lkb->lkb_lvbseq = ms->m_lvbseq;
+
+	return 0;
+}
+
+static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			       struct dlm_message *ms)
+{
+	if (!is_master_copy(lkb))
+		return -EINVAL;
+	if (receive_lvb(ls, lkb, ms))
+		return -ENOMEM;
+	return 0;
+}
+
+/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
+   uses to send a reply and that the remote end uses to process the reply. */
+
+static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb = &ls->ls_stub_lkb;
+	lkb->lkb_nodeid = ms->m_header.h_nodeid;
+	lkb->lkb_remid = ms->m_lkid;
+}
+
+static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error, namelen;
+
+	error = create_lkb(ls, &lkb);
+	if (error)
+		goto fail;
+
+	receive_flags(lkb, ms);
+	lkb->lkb_flags |= DLM_IFL_MSTCPY;
+	error = receive_request_args(ls, lkb, ms);
+	if (error) {
+		__put_lkb(ls, lkb);
+		goto fail;
+	}
+
+	namelen = receive_extralen(ms);
+
+	error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
+	if (error) {
+		__put_lkb(ls, lkb);
+		goto fail;
+	}
+
+	lock_rsb(r);
+
+	attach_lkb(r, lkb);
+	error = do_request(r, lkb);
+	send_request_reply(r, lkb, error);
+
+	unlock_rsb(r);
+	put_rsb(r);
+
+	if (error == -EINPROGRESS)
+		error = 0;
+	if (error)
+		dlm_put_lkb(lkb);
+	return;
+
+ fail:
+	setup_stub_lkb(ls, ms);
+	send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error, reply = 1;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		goto fail;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	receive_flags(lkb, ms);
+	error = receive_convert_args(ls, lkb, ms);
+	if (error)
+		goto out;
+	reply = !down_conversion(lkb);
+
+	error = do_convert(r, lkb);
+ out:
+	if (reply)
+		send_convert_reply(r, lkb, error);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+	return;
+
+ fail:
+	setup_stub_lkb(ls, ms);
+	send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		goto fail;
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	receive_flags(lkb, ms);
+	error = receive_unlock_args(ls, lkb, ms);
+	if (error)
+		goto out;
+
+	error = do_unlock(r, lkb);
+ out:
+	send_unlock_reply(r, lkb, error);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+	return;
+
+ fail:
+	setup_stub_lkb(ls, ms);
+	send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error)
+		goto fail;
+
+	receive_flags(lkb, ms);
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	error = do_cancel(r, lkb);
+	send_cancel_reply(r, lkb, error);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+	return;
+
+ fail:
+	setup_stub_lkb(ls, ms);
+	send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
+}
+
+static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error) {
+		log_error(ls, "receive_grant no lkb");
+		return;
+	}
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	receive_flags_reply(lkb, ms);
+	grant_lock_pc(r, lkb, ms);
+	queue_cast(r, lkb, 0);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+}
+
+static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error) {
+		log_error(ls, "receive_bast no lkb");
+		return;
+	}
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	queue_bast(r, lkb, ms->m_bastmode);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+}
+
+static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;
+
+	from_nodeid = ms->m_header.h_nodeid;
+	our_nodeid = dlm_our_nodeid();
+
+	len = receive_extralen(ms);
+
+	dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+	if (dir_nodeid != our_nodeid) {
+		log_error(ls, "lookup dir_nodeid %d from %d",
+			  dir_nodeid, from_nodeid);
+		error = -EINVAL;
+		ret_nodeid = -1;
+		goto out;
+	}
+
+	error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);
+
+	/* Optimization: we're master so treat lookup as a request */
+	if (!error && ret_nodeid == our_nodeid) {
+		receive_request(ls, ms);
+		return;
+	}
+ out:
+	send_lookup_reply(ls, ms, ret_nodeid, error);
+}
+
+static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	int len, dir_nodeid, from_nodeid;
+
+	from_nodeid = ms->m_header.h_nodeid;
+
+	len = receive_extralen(ms);
+
+	dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
+	if (dir_nodeid != dlm_our_nodeid()) {
+		log_error(ls, "remove dir entry dir_nodeid %d from %d",
+			  dir_nodeid, from_nodeid);
+		return;
+	}
+
+	dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
+}
+
+static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error, mstype;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error) {
+		log_error(ls, "receive_request_reply no lkb");
+		return;
+	}
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	mstype = lkb->lkb_wait_type;
+	error = remove_from_waiters(lkb);
+	if (error) {
+		log_error(ls, "receive_request_reply not on waiters");
+		goto out;
+	}
+
+	/* this is the value returned from do_request() on the master */
+	error = ms->m_result;
+
+	r = lkb->lkb_resource;
+	hold_rsb(r);
+	lock_rsb(r);
+
+	/* Optimization: the dir node was also the master, so it took our
+	   lookup as a request and sent request reply instead of lookup reply */
+	if (mstype == DLM_MSG_LOOKUP) {
+		r->res_nodeid = ms->m_header.h_nodeid;
+		lkb->lkb_nodeid = r->res_nodeid;
+	}
+
+	switch (error) {
+	case -EAGAIN:
+		/* request would block (be queued) on remote master;
+		   the unhold undoes the original ref from create_lkb()
+		   so it leads to the lkb being freed */
+		queue_cast(r, lkb, -EAGAIN);
+		confirm_master(r, -EAGAIN);
+		unhold_lkb(lkb);
+		break;
+
+	case -EINPROGRESS:
+	case 0:
+		/* request was queued or granted on remote master */
+		receive_flags_reply(lkb, ms);
+		lkb->lkb_remid = ms->m_lkid;
+		if (error)
+			add_lkb(r, lkb, DLM_LKSTS_WAITING);
+		else {
+			grant_lock_pc(r, lkb, ms);
+			queue_cast(r, lkb, 0);
+		}
+		confirm_master(r, error);
+		break;
+
+	case -EBADR:
+	case -ENOTBLK:
+		/* find_rsb failed to find rsb or rsb wasn't master */
+		r->res_nodeid = -1;
+		lkb->lkb_nodeid = -1;
+		_request_lock(r, lkb);
+		break;
+
+	default:
+		log_error(ls, "receive_request_reply error %d", error);
+	}
+
+	unlock_rsb(r);
+	put_rsb(r);
+ out:
+	dlm_put_lkb(lkb);
+}
+
+static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
+				    struct dlm_message *ms)
+{
+	int error = ms->m_result;
+
+	/* this is the value returned from do_convert() on the master */
+
+	switch (error) {
+	case -EAGAIN:
+		/* convert would block (be queued) on remote master */
+		queue_cast(r, lkb, -EAGAIN);
+		break;
+
+	case -EINPROGRESS:
+		/* convert was queued on remote master */
+		del_lkb(r, lkb);
+		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+		break;
+
+	case 0:
+		/* convert was granted on remote master */
+		receive_flags_reply(lkb, ms);
+		grant_lock_pc(r, lkb, ms);
+		queue_cast(r, lkb, 0);
+		break;
+
+	default:
+		log_error(r->res_ls, "receive_convert_reply error %d", error);
+	}
+}
+
+static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	__receive_convert_reply(r, lkb, ms);
+
+	unlock_rsb(r);
+	put_rsb(r);
+}
+
+static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error) {
+		log_error(ls, "receive_convert_reply no lkb");
+		return;
+	}
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	error = remove_from_waiters(lkb);
+	if (error) {
+		log_error(ls, "receive_convert_reply not on waiters");
+		goto out;
+	}
+
+	_receive_convert_reply(lkb, ms);
+ out:
+	dlm_put_lkb(lkb);
+}
+
+static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int error = ms->m_result;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	/* this is the value returned from do_unlock() on the master */
+
+	switch (error) {
+	case -DLM_EUNLOCK:
+		receive_flags_reply(lkb, ms);
+		remove_lock_pc(r, lkb);
+		queue_cast(r, lkb, -DLM_EUNLOCK);
+		break;
+	default:
+		log_error(r->res_ls, "receive_unlock_reply error %d", error);
+	}
+
+	unlock_rsb(r);
+	put_rsb(r);
+}
+
+static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error) {
+		log_error(ls, "receive_unlock_reply no lkb");
+		return;
+	}
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	error = remove_from_waiters(lkb);
+	if (error) {
+		log_error(ls, "receive_unlock_reply not on waiters");
+		goto out;
+	}
+
+	_receive_unlock_reply(lkb, ms);
+ out:
+	dlm_put_lkb(lkb);
+}
+
+static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
+{
+	struct dlm_rsb *r = lkb->lkb_resource;
+	int error = ms->m_result;
+
+	hold_rsb(r);
+	lock_rsb(r);
+
+	/* this is the value returned from do_cancel() on the master */
+
+	switch (error) {
+	case -DLM_ECANCEL:
+		receive_flags_reply(lkb, ms);
+		revert_lock_pc(r, lkb);
+		queue_cast(r, lkb, -DLM_ECANCEL);
+		break;
+	default:
+		log_error(r->res_ls, "receive_cancel_reply error %d", error);
+	}
+
+	unlock_rsb(r);
+	put_rsb(r);
+}
+
+static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	int error;
+
+	error = find_lkb(ls, ms->m_remid, &lkb);
+	if (error) {
+		log_error(ls, "receive_cancel_reply no lkb");
+		return;
+	}
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	error = remove_from_waiters(lkb);
+	if (error) {
+		log_error(ls, "receive_cancel_reply not on waiters");
+		goto out;
+	}
+
+	_receive_cancel_reply(lkb, ms);
+ out:
+	dlm_put_lkb(lkb);
+}
+
+static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error, ret_nodeid;
+
+	error = find_lkb(ls, ms->m_lkid, &lkb);
+	if (error) {
+		log_error(ls, "receive_lookup_reply no lkb");
+		return;
+	}
+
+	error = remove_from_waiters(lkb);
+	if (error) {
+		log_error(ls, "receive_lookup_reply not on waiters");
+		goto out;
+	}
+
+	/* this is the value returned by dlm_dir_lookup on dir node
+	   FIXME: will a non-zero error ever be returned? */
+	error = ms->m_result;
+
+	r = lkb->lkb_resource;
+	hold_rsb(r);
+	lock_rsb(r);
+
+	ret_nodeid = ms->m_nodeid;
+	if (ret_nodeid == dlm_our_nodeid()) {
+		r->res_nodeid = 0;
+		ret_nodeid = 0;
+		r->res_first_lkid = 0;
+	} else {
+		/* set_master() will copy res_nodeid to lkb_nodeid */
+		r->res_nodeid = ret_nodeid;
+	}
+
+	_request_lock(r, lkb);
+
+	if (!ret_nodeid)
+		process_lookup_list(r);
+
+	unlock_rsb(r);
+	put_rsb(r);
+ out:
+	dlm_put_lkb(lkb);
+}
+
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
+{
+	struct dlm_message *ms = (struct dlm_message *) hd;
+	struct dlm_ls *ls;
+	int error;
+
+	if (!recovery)
+		dlm_message_in(ms);
+
+	ls = dlm_find_lockspace_global(hd->h_lockspace);
+	if (!ls) {
+		log_print("drop message %d from %d for unknown lockspace %d",
+			  ms->m_type, nodeid, hd->h_lockspace);
+		return -EINVAL;
+	}
+
+	/* recovery may have just ended leaving a bunch of backed-up requests
+	   in the requestqueue; wait while dlm_recoverd clears them */
+
+	if (!recovery)
+		dlm_wait_requestqueue(ls);
+
+	/* recovery may have just started while there were a bunch of
+	   in-flight requests -- save them in requestqueue to be processed
+	   after recovery.  we can't let dlm_recvd block on the recovery
+	   lock.  if dlm_recoverd is calling this function to clear the
+	   requestqueue, it needs to be interrupted (-EINTR) if another
+	   recovery operation is starting. */
+
+	while (1) {
+		if (dlm_locking_stopped(ls)) {
+			if (!recovery)
+				dlm_add_requestqueue(ls, nodeid, hd);
+			error = -EINTR;
+			goto out;
+		}
+
+		if (lock_recovery_try(ls))
+			break;
+		schedule();
+	}
+
+	switch (ms->m_type) {
+
+	/* messages sent to a master node */
+
+	case DLM_MSG_REQUEST:
+		receive_request(ls, ms);
+		break;
+
+	case DLM_MSG_CONVERT:
+		receive_convert(ls, ms);
+		break;
+
+	case DLM_MSG_UNLOCK:
+		receive_unlock(ls, ms);
+		break;
+
+	case DLM_MSG_CANCEL:
+		receive_cancel(ls, ms);
+		break;
+
+	/* messages sent from a master node (replies to above) */
+
+	case DLM_MSG_REQUEST_REPLY:
+		receive_request_reply(ls, ms);
+		break;
+
+	case DLM_MSG_CONVERT_REPLY:
+		receive_convert_reply(ls, ms);
+		break;
+
+	case DLM_MSG_UNLOCK_REPLY:
+		receive_unlock_reply(ls, ms);
+		break;
+
+	case DLM_MSG_CANCEL_REPLY:
+		receive_cancel_reply(ls, ms);
+		break;
+
+	/* messages sent from a master node (only two types of async msg) */
+
+	case DLM_MSG_GRANT:
+		receive_grant(ls, ms);
+		break;
+
+	case DLM_MSG_BAST:
+		receive_bast(ls, ms);
+		break;
+
+	/* messages sent to a dir node */
+
+	case DLM_MSG_LOOKUP:
+		receive_lookup(ls, ms);
+		break;
+
+	case DLM_MSG_REMOVE:
+		receive_remove(ls, ms);
+		break;
+
+	/* messages sent from a dir node (remove has no reply) */
+
+	case DLM_MSG_LOOKUP_REPLY:
+		receive_lookup_reply(ls, ms);
+		break;
+
+	default:
+		log_error(ls, "unknown message type %d", ms->m_type);
+	}
+
+	unlock_recovery(ls);
+ out:
+	dlm_put_lockspace(ls);
+	dlm_astd_wake();
+	return 0;
+}
+
+
+/*
+ * Recovery related
+ */
+
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	if (middle_conversion(lkb)) {
+		hold_lkb(lkb);
+		ls->ls_stub_ms.m_result = -EINPROGRESS;
+		_remove_from_waiters(lkb);
+		_receive_convert_reply(lkb, &ls->ls_stub_ms);
+
+		/* Same special case as in receive_rcom_lock_args() */
+		lkb->lkb_grmode = DLM_LOCK_IV;
+		rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
+		unhold_lkb(lkb);
+
+	} else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
+		lkb->lkb_flags |= DLM_IFL_RESEND;
+	}
+
+	/* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
+	   conversions are async; there's no reply from the remote master */
+}
+
+/* A waiting lkb needs recovery if the master node has failed, or
+   the master node is changing (only when no directory is used) */
+
+static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	if (dlm_is_removed(ls, lkb->lkb_nodeid))
+		return 1;
+
+	if (!dlm_no_directory(ls))
+		return 0;
+
+	if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
+		return 1;
+
+	return 0;
+}
+
+/* Recovery for locks that are waiting for replies from nodes that are now
+   gone.  We can just complete unlocks and cancels by faking a reply from the
+   dead node.  Requests and up-conversions we flag to be resent after
+   recovery.  Down-conversions can just be completed with a fake reply like
+   unlocks.  Conversions between PR and CW need special attention. */
+
+void dlm_recover_waiters_pre(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+
+	list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
+		log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
+			  lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+
+		/* all outstanding lookups, regardless of destination  will be
+		   resent after recovery is done */
+
+		if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
+			lkb->lkb_flags |= DLM_IFL_RESEND;
+			continue;
+		}
+
+		if (!waiter_needs_recovery(ls, lkb))
+			continue;
+
+		switch (lkb->lkb_wait_type) {
+
+		case DLM_MSG_REQUEST:
+			lkb->lkb_flags |= DLM_IFL_RESEND;
+			break;
+
+		case DLM_MSG_CONVERT:
+			recover_convert_waiter(ls, lkb);
+			break;
+
+		case DLM_MSG_UNLOCK:
+			hold_lkb(lkb);
+			ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+			_remove_from_waiters(lkb);
+			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
+			dlm_put_lkb(lkb);
+			break;
+
+		case DLM_MSG_CANCEL:
+			hold_lkb(lkb);
+			ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+			_remove_from_waiters(lkb);
+			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
+			dlm_put_lkb(lkb);
+			break;
+
+		default:
+			log_error(ls, "invalid lkb wait_type %d",
+				  lkb->lkb_wait_type);
+		}
+		schedule();
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+}
+
+static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
+{
+	struct dlm_lkb *lkb;
+	int rv = 0;
+
+	mutex_lock(&ls->ls_waiters_mutex);
+	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+		if (lkb->lkb_flags & DLM_IFL_RESEND) {
+			rv = lkb->lkb_wait_type;
+			_remove_from_waiters(lkb);
+			lkb->lkb_flags &= ~DLM_IFL_RESEND;
+			break;
+		}
+	}
+	mutex_unlock(&ls->ls_waiters_mutex);
+
+	if (!rv)
+		lkb = NULL;
+	*lkb_ret = lkb;
+	return rv;
+}
+
+/* Deal with lookups and lkb's marked RESEND from _pre.  We may now be the
+   master or dir-node for r.  Processing the lkb may result in it being placed
+   back on waiters. */
+
+int dlm_recover_waiters_post(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *r;
+	int error = 0, mstype;
+
+	while (1) {
+		if (dlm_locking_stopped(ls)) {
+			log_debug(ls, "recover_waiters_post aborted");
+			error = -EINTR;
+			break;
+		}
+
+		mstype = remove_resend_waiter(ls, &lkb);
+		if (!mstype)
+			break;
+
+		r = lkb->lkb_resource;
+
+		log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
+			  lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+
+		switch (mstype) {
+
+		case DLM_MSG_LOOKUP:
+			hold_rsb(r);
+			lock_rsb(r);
+			_request_lock(r, lkb);
+			if (is_master(r))
+				confirm_master(r, 0);
+			unlock_rsb(r);
+			put_rsb(r);
+			break;
+
+		case DLM_MSG_REQUEST:
+			hold_rsb(r);
+			lock_rsb(r);
+			_request_lock(r, lkb);
+			if (is_master(r))
+				confirm_master(r, 0);
+			unlock_rsb(r);
+			put_rsb(r);
+			break;
+
+		case DLM_MSG_CONVERT:
+			hold_rsb(r);
+			lock_rsb(r);
+			_convert_lock(r, lkb);
+			unlock_rsb(r);
+			put_rsb(r);
+			break;
+
+		default:
+			log_error(ls, "recover_waiters_post type %d", mstype);
+		}
+	}
+
+	return error;
+}
+
+static void purge_queue(struct dlm_rsb *r, struct list_head *queue,
+			int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb))
+{
+	struct dlm_ls *ls = r->res_ls;
+	struct dlm_lkb *lkb, *safe;
+
+	list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) {
+		if (test(ls, lkb)) {
+			rsb_set_flag(r, RSB_LOCKS_PURGED);
+			del_lkb(r, lkb);
+			/* this put should free the lkb */
+			if (!dlm_put_lkb(lkb))
+				log_error(ls, "purged lkb not released");
+		}
+	}
+}
+
+static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid));
+}
+
+static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	return is_master_copy(lkb);
+}
+
+static void purge_dead_locks(struct dlm_rsb *r)
+{
+	purge_queue(r, &r->res_grantqueue, &purge_dead_test);
+	purge_queue(r, &r->res_convertqueue, &purge_dead_test);
+	purge_queue(r, &r->res_waitqueue, &purge_dead_test);
+}
+
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
+{
+	purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test);
+	purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test);
+	purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test);
+}
+
+/* Get rid of locks held by nodes that are gone. */
+
+int dlm_purge_locks(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+
+	log_debug(ls, "dlm_purge_locks");
+
+	down_write(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		hold_rsb(r);
+		lock_rsb(r);
+		if (is_master(r))
+			purge_dead_locks(r);
+		unlock_rsb(r);
+		unhold_rsb(r);
+
+		schedule();
+	}
+	up_write(&ls->ls_root_sem);
+
+	return 0;
+}
+
+static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
+{
+	struct dlm_rsb *r, *r_ret = NULL;
+
+	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
+		if (!rsb_flag(r, RSB_LOCKS_PURGED))
+			continue;
+		hold_rsb(r);
+		rsb_clear_flag(r, RSB_LOCKS_PURGED);
+		r_ret = r;
+		break;
+	}
+	read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	return r_ret;
+}
+
+void dlm_grant_after_purge(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	int bucket = 0;
+
+	while (1) {
+		r = find_purged_rsb(ls, bucket);
+		if (!r) {
+			if (bucket == ls->ls_rsbtbl_size - 1)
+				break;
+			bucket++;
+			continue;
+		}
+		lock_rsb(r);
+		if (is_master(r)) {
+			grant_pending_locks(r);
+			confirm_master(r, 0);
+		}
+		unlock_rsb(r);
+		put_rsb(r);
+		schedule();
+	}
+}
+
+static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
+					 uint32_t remid)
+{
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, head, lkb_statequeue) {
+		if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
+			return lkb;
+	}
+	return NULL;
+}
+
+static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
+				    uint32_t remid)
+{
+	struct dlm_lkb *lkb;
+
+	lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
+	if (lkb)
+		return lkb;
+	lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
+	if (lkb)
+		return lkb;
+	lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
+	if (lkb)
+		return lkb;
+	return NULL;
+}
+
+static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
+				  struct dlm_rsb *r, struct dlm_rcom *rc)
+{
+	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+	int lvblen;
+
+	lkb->lkb_nodeid = rc->rc_header.h_nodeid;
+	lkb->lkb_ownpid = rl->rl_ownpid;
+	lkb->lkb_remid = rl->rl_lkid;
+	lkb->lkb_exflags = rl->rl_exflags;
+	lkb->lkb_flags = rl->rl_flags & 0x0000FFFF;
+	lkb->lkb_flags |= DLM_IFL_MSTCPY;
+	lkb->lkb_lvbseq = rl->rl_lvbseq;
+	lkb->lkb_rqmode = rl->rl_rqmode;
+	lkb->lkb_grmode = rl->rl_grmode;
+	/* don't set lkb_status because add_lkb wants to itself */
+
+	lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST);
+	lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP);
+
+	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+		lkb->lkb_lvbptr = allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+		lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
+			 sizeof(struct rcom_lock);
+		memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
+	}
+
+	/* Conversions between PR and CW (middle modes) need special handling.
+	   The real granted mode of these converting locks cannot be determined
+	   until all locks have been rebuilt on the rsb (recover_conversion) */
+
+	if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) {
+		rl->rl_status = DLM_LKSTS_CONVERT;
+		lkb->lkb_grmode = DLM_LOCK_IV;
+		rsb_set_flag(r, RSB_RECOVER_CONVERT);
+	}
+
+	return 0;
+}
+
+/* This lkb may have been recovered in a previous aborted recovery so we need
+   to check if the rsb already has an lkb with the given remote nodeid/lkid.
+   If so we just send back a standard reply.  If not, we create a new lkb with
+   the given values and send back our lkid.  We send back our lkid by sending
+   back the rcom_lock struct we got but with the remid field filled in. */
+
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb;
+	int error;
+
+	if (rl->rl_parent_lkid) {
+		error = -EOPNOTSUPP;
+		goto out;
+	}
+
+	error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r);
+	if (error)
+		goto out;
+
+	lock_rsb(r);
+
+	lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid);
+	if (lkb) {
+		error = -EEXIST;
+		goto out_remid;
+	}
+
+	error = create_lkb(ls, &lkb);
+	if (error)
+		goto out_unlock;
+
+	error = receive_rcom_lock_args(ls, lkb, r, rc);
+	if (error) {
+		__put_lkb(ls, lkb);
+		goto out_unlock;
+	}
+
+	attach_lkb(r, lkb);
+	add_lkb(r, lkb, rl->rl_status);
+	error = 0;
+
+ out_remid:
+	/* this is the new value returned to the lock holder for
+	   saving in its process-copy lkb */
+	rl->rl_remid = lkb->lkb_id;
+
+ out_unlock:
+	unlock_rsb(r);
+	put_rsb(r);
+ out:
+	if (error)
+		log_print("recover_master_copy %d %x", error, rl->rl_lkid);
+	rl->rl_result = error;
+	return error;
+}
+
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
+	struct dlm_rsb *r;
+	struct dlm_lkb *lkb;
+	int error;
+
+	error = find_lkb(ls, rl->rl_lkid, &lkb);
+	if (error) {
+		log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid);
+		return error;
+	}
+
+	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););
+
+	error = rl->rl_result;
+
+	r = lkb->lkb_resource;
+	hold_rsb(r);
+	lock_rsb(r);
+
+	switch (error) {
+	case -EEXIST:
+		log_debug(ls, "master copy exists %x", lkb->lkb_id);
+		/* fall through */
+	case 0:
+		lkb->lkb_remid = rl->rl_remid;
+		break;
+	default:
+		log_error(ls, "dlm_recover_process_copy unknown error %d %x",
+			  error, lkb->lkb_id);
+	}
+
+	/* an ack for dlm_recover_locks() which waits for replies from
+	   all the locks it sends to new masters */
+	dlm_recovered_lock(r);
+
+	unlock_rsb(r);
+	put_rsb(r);
+	dlm_put_lkb(lkb);
+
+	return 0;
+}
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
+		     int mode, uint32_t flags, void *name, unsigned int namelen,
+		     uint32_t parent_lkid)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	int error;
+
+	lock_recovery(ls);
+
+	error = create_lkb(ls, &lkb);
+	if (error) {
+		kfree(ua);
+		goto out;
+	}
+
+	if (flags & DLM_LKF_VALBLK) {
+		ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+		if (!ua->lksb.sb_lvbptr) {
+			kfree(ua);
+			__put_lkb(ls, lkb);
+			error = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/* After ua is attached to lkb it will be freed by free_lkb().
+	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
+	   lock and that lkb_astparam is the dlm_user_args structure. */
+
+	error = set_lock_args(mode, &ua->lksb, flags, namelen, parent_lkid,
+			      DLM_FAKE_USER_AST, ua, DLM_FAKE_USER_AST, &args);
+	lkb->lkb_flags |= DLM_IFL_USER;
+	ua->old_mode = DLM_LOCK_IV;
+
+	if (error) {
+		__put_lkb(ls, lkb);
+		goto out;
+	}
+
+	error = request_lock(ls, lkb, name, namelen, &args);
+
+	switch (error) {
+	case 0:
+		break;
+	case -EINPROGRESS:
+		error = 0;
+		break;
+	case -EAGAIN:
+		error = 0;
+		/* fall through */
+	default:
+		__put_lkb(ls, lkb);
+		goto out;
+	}
+
+	/* add this new lkb to the per-process list of locks */
+	spin_lock(&ua->proc->locks_spin);
+	kref_get(&lkb->lkb_ref);
+	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
+	spin_unlock(&ua->proc->locks_spin);
+ out:
+	unlock_recovery(ls);
+	return error;
+}
+
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		     int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	int error;
+
+	lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	/* user can change the params on its lock when it converts it, or
+	   add an lvb that didn't exist before */
+
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+	if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
+		ua->lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
+		if (!ua->lksb.sb_lvbptr) {
+			error = -ENOMEM;
+			goto out_put;
+		}
+	}
+	if (lvb_in && ua->lksb.sb_lvbptr)
+		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+
+	ua->castparam = ua_tmp->castparam;
+	ua->castaddr = ua_tmp->castaddr;
+	ua->bastparam = ua_tmp->bastparam;
+	ua->bastaddr = ua_tmp->bastaddr;
+	ua->user_lksb = ua_tmp->user_lksb;
+	ua->old_mode = lkb->lkb_grmode;
+
+	error = set_lock_args(mode, &ua->lksb, flags, 0, 0, DLM_FAKE_USER_AST,
+			      ua, DLM_FAKE_USER_AST, &args);
+	if (error)
+		goto out_put;
+
+	error = convert_lock(ls, lkb, &args);
+
+	if (error == -EINPROGRESS || error == -EAGAIN)
+		error = 0;
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	unlock_recovery(ls);
+	kfree(ua_tmp);
+	return error;
+}
+
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		    uint32_t flags, uint32_t lkid, char *lvb_in)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	int error;
+
+	lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+	if (lvb_in && ua->lksb.sb_lvbptr)
+		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
+	ua->castparam = ua_tmp->castparam;
+	ua->user_lksb = ua_tmp->user_lksb;
+
+	error = set_unlock_args(flags, ua, &args);
+	if (error)
+		goto out_put;
+
+	error = unlock_lock(ls, lkb, &args);
+
+	if (error == -DLM_EUNLOCK)
+		error = 0;
+	if (error)
+		goto out_put;
+
+	spin_lock(&ua->proc->locks_spin);
+	list_del_init(&lkb->lkb_ownqueue);
+	spin_unlock(&ua->proc->locks_spin);
+
+	/* this removes the reference for the proc->locks list added by
+	   dlm_user_request */
+	unhold_lkb(lkb);
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	unlock_recovery(ls);
+	return error;
+}
+
+int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+		    uint32_t flags, uint32_t lkid)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_args args;
+	struct dlm_user_args *ua;
+	int error;
+
+	lock_recovery(ls);
+
+	error = find_lkb(ls, lkid, &lkb);
+	if (error)
+		goto out;
+
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+	ua->castparam = ua_tmp->castparam;
+	ua->user_lksb = ua_tmp->user_lksb;
+
+	error = set_unlock_args(flags, ua, &args);
+	if (error)
+		goto out_put;
+
+	error = cancel_lock(ls, lkb, &args);
+
+	if (error == -DLM_ECANCEL)
+		error = 0;
+	if (error)
+		goto out_put;
+
+	/* this lkb was removed from the WAITING queue */
+	if (lkb->lkb_grmode == DLM_LOCK_IV) {
+		spin_lock(&ua->proc->locks_spin);
+		list_del_init(&lkb->lkb_ownqueue);
+		spin_unlock(&ua->proc->locks_spin);
+		unhold_lkb(lkb);
+	}
+ out_put:
+	dlm_put_lkb(lkb);
+ out:
+	unlock_recovery(ls);
+	return error;
+}
+
+static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
+
+	if (ua->lksb.sb_lvbptr)
+		kfree(ua->lksb.sb_lvbptr);
+	kfree(ua);
+	lkb->lkb_astparam = (long)NULL;
+
+	/* TODO: propogate to master if needed */
+	return 0;
+}
+
+/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
+   Regardless of what rsb queue the lock is on, it's removed and freed. */
+
+static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
+{
+	struct dlm_user_args *ua = (struct dlm_user_args *)lkb->lkb_astparam;
+	struct dlm_args args;
+	int error;
+
+	/* FIXME: we need to handle the case where the lkb is in limbo
+	   while the rsb is being looked up, currently we assert in
+	   _unlock_lock/is_remote because rsb nodeid is -1. */
+
+	set_unlock_args(DLM_LKF_FORCEUNLOCK, ua, &args);
+
+	error = unlock_lock(ls, lkb, &args);
+	if (error == -DLM_EUNLOCK)
+		error = 0;
+	return error;
+}
+
+/* The ls_clear_proc_locks mutex protects against dlm_user_add_asts() which
+   1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
+   which we clear here. */
+
+/* proc CLOSING flag is set so no more device_reads should look at proc->asts
+   list, and no more device_writes should add lkb's to proc->locks list; so we
+   shouldn't need to take asts_spin or locks_spin here.  this assumes that
+   device reads/writes/closes are serialized -- FIXME: we may need to serialize
+   them ourself. */
+
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
+{
+	struct dlm_lkb *lkb, *safe;
+
+	lock_recovery(ls);
+	mutex_lock(&ls->ls_clear_proc_locks);
+
+	list_for_each_entry_safe(lkb, safe, &proc->locks, lkb_ownqueue) {
+		if (lkb->lkb_ast_type) {
+			list_del(&lkb->lkb_astqueue);
+			unhold_lkb(lkb);
+		}
+
+		list_del_init(&lkb->lkb_ownqueue);
+
+		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT) {
+			lkb->lkb_flags |= DLM_IFL_ORPHAN;
+			orphan_proc_lock(ls, lkb);
+		} else {
+			lkb->lkb_flags |= DLM_IFL_DEAD;
+			unlock_proc_lock(ls, lkb);
+		}
+
+		/* this removes the reference for the proc->locks list
+		   added by dlm_user_request, it may result in the lkb
+		   being freed */
+
+		dlm_put_lkb(lkb);
+	}
+	mutex_unlock(&ls->ls_clear_proc_locks);
+	unlock_recovery(ls);
+}
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
new file mode 100644
index 000000000000..0843a3073ec3
--- /dev/null
+++ b/fs/dlm/lock.h
@@ -0,0 +1,62 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCK_DOT_H__
+#define __LOCK_DOT_H__
+
+void dlm_print_rsb(struct dlm_rsb *r);
+void dlm_dump_rsb(struct dlm_rsb *r);
+void dlm_print_lkb(struct dlm_lkb *lkb);
+int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
+int dlm_modes_compat(int mode1, int mode2);
+int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
+	unsigned int flags, struct dlm_rsb **r_ret);
+void dlm_put_rsb(struct dlm_rsb *r);
+void dlm_hold_rsb(struct dlm_rsb *r);
+int dlm_put_lkb(struct dlm_lkb *lkb);
+void dlm_scan_rsbs(struct dlm_ls *ls);
+
+int dlm_purge_locks(struct dlm_ls *ls);
+void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
+void dlm_grant_after_purge(struct dlm_ls *ls);
+int dlm_recover_waiters_post(struct dlm_ls *ls);
+void dlm_recover_waiters_pre(struct dlm_ls *ls);
+int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
+	uint32_t flags, void *name, unsigned int namelen, uint32_t parent_lkid);
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+	int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+	uint32_t flags, uint32_t lkid, char *lvb_in);
+int dlm_user_cancel(struct dlm_ls *ls,  struct dlm_user_args *ua_tmp,
+	uint32_t flags, uint32_t lkid);
+void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc);
+
+static inline int is_master(struct dlm_rsb *r)
+{
+	return !r->res_nodeid;
+}
+
+static inline void lock_rsb(struct dlm_rsb *r)
+{
+	mutex_lock(&r->res_mutex);
+}
+
+static inline void unlock_rsb(struct dlm_rsb *r)
+{
+	mutex_unlock(&r->res_mutex);
+}
+
+#endif
+
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
new file mode 100644
index 000000000000..109333c8ecb9
--- /dev/null
+++ b/fs/dlm/lockspace.c
@@ -0,0 +1,717 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "ast.h"
+#include "dir.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "recover.h"
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_create_debug_file(struct dlm_ls *ls);
+void dlm_delete_debug_file(struct dlm_ls *ls);
+#else
+static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; }
+static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
+#endif
+
+static int			ls_count;
+static struct mutex		ls_lock;
+static struct list_head		lslist;
+static spinlock_t		lslist_lock;
+static struct task_struct *	scand_task;
+
+
+static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	ssize_t ret = len;
+	int n = simple_strtol(buf, NULL, 0);
+
+	switch (n) {
+	case 0:
+		dlm_ls_stop(ls);
+		break;
+	case 1:
+		dlm_ls_start(ls);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
+	set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
+	wake_up(&ls->ls_uevent_wait);
+	return len;
+}
+
+static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u\n", ls->ls_global_id);
+}
+
+static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
+{
+	ls->ls_global_id = simple_strtoul(buf, NULL, 0);
+	return len;
+}
+
+static ssize_t dlm_recover_status_show(struct dlm_ls *ls, char *buf)
+{
+	uint32_t status = dlm_recover_status(ls);
+	return snprintf(buf, PAGE_SIZE, "%x\n", status);
+}
+
+static ssize_t dlm_recover_nodeid_show(struct dlm_ls *ls, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n", ls->ls_recover_nodeid);
+}
+
+struct dlm_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct dlm_ls *, char *);
+	ssize_t (*store)(struct dlm_ls *, const char *, size_t);
+};
+
+static struct dlm_attr dlm_attr_control = {
+	.attr  = {.name = "control", .mode = S_IWUSR},
+	.store = dlm_control_store
+};
+
+static struct dlm_attr dlm_attr_event = {
+	.attr  = {.name = "event_done", .mode = S_IWUSR},
+	.store = dlm_event_store
+};
+
+static struct dlm_attr dlm_attr_id = {
+	.attr  = {.name = "id", .mode = S_IRUGO | S_IWUSR},
+	.show  = dlm_id_show,
+	.store = dlm_id_store
+};
+
+static struct dlm_attr dlm_attr_recover_status = {
+	.attr  = {.name = "recover_status", .mode = S_IRUGO},
+	.show  = dlm_recover_status_show
+};
+
+static struct dlm_attr dlm_attr_recover_nodeid = {
+	.attr  = {.name = "recover_nodeid", .mode = S_IRUGO},
+	.show  = dlm_recover_nodeid_show
+};
+
+static struct attribute *dlm_attrs[] = {
+	&dlm_attr_control.attr,
+	&dlm_attr_event.attr,
+	&dlm_attr_id.attr,
+	&dlm_attr_recover_status.attr,
+	&dlm_attr_recover_nodeid.attr,
+	NULL,
+};
+
+static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr,
+			     char *buf)
+{
+	struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
+	struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+	return a->show ? a->show(ls, buf) : 0;
+}
+
+static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr,
+			      const char *buf, size_t len)
+{
+	struct dlm_ls *ls  = container_of(kobj, struct dlm_ls, ls_kobj);
+	struct dlm_attr *a = container_of(attr, struct dlm_attr, attr);
+	return a->store ? a->store(ls, buf, len) : len;
+}
+
+static struct sysfs_ops dlm_attr_ops = {
+	.show  = dlm_attr_show,
+	.store = dlm_attr_store,
+};
+
+static struct kobj_type dlm_ktype = {
+	.default_attrs = dlm_attrs,
+	.sysfs_ops     = &dlm_attr_ops,
+};
+
+static struct kset dlm_kset = {
+	.subsys = &kernel_subsys,
+	.kobj   = {.name = "dlm",},
+	.ktype  = &dlm_ktype,
+};
+
+static int kobject_setup(struct dlm_ls *ls)
+{
+	char lsname[DLM_LOCKSPACE_LEN];
+	int error;
+
+	memset(lsname, 0, DLM_LOCKSPACE_LEN);
+	snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name);
+
+	error = kobject_set_name(&ls->ls_kobj, "%s", lsname);
+	if (error)
+		return error;
+
+	ls->ls_kobj.kset = &dlm_kset;
+	ls->ls_kobj.ktype = &dlm_ktype;
+	return 0;
+}
+
+static int do_uevent(struct dlm_ls *ls, int in)
+{
+	int error;
+
+	if (in)
+		kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE);
+	else
+		kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
+
+	error = wait_event_interruptible(ls->ls_uevent_wait,
+			test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
+	if (error)
+		goto out;
+
+	error = ls->ls_uevent_result;
+ out:
+	return error;
+}
+
+
+int dlm_lockspace_init(void)
+{
+	int error;
+
+	ls_count = 0;
+	mutex_init(&ls_lock);
+	INIT_LIST_HEAD(&lslist);
+	spin_lock_init(&lslist_lock);
+
+	error = kset_register(&dlm_kset);
+	if (error)
+		printk("dlm_lockspace_init: cannot register kset %d\n", error);
+	return error;
+}
+
+void dlm_lockspace_exit(void)
+{
+	kset_unregister(&dlm_kset);
+}
+
+static int dlm_scand(void *data)
+{
+	struct dlm_ls *ls;
+
+	while (!kthread_should_stop()) {
+		list_for_each_entry(ls, &lslist, ls_list)
+			dlm_scan_rsbs(ls);
+		schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
+	}
+	return 0;
+}
+
+static int dlm_scand_start(void)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	p = kthread_run(dlm_scand, NULL, "dlm_scand");
+	if (IS_ERR(p))
+		error = PTR_ERR(p);
+	else
+		scand_task = p;
+	return error;
+}
+
+static void dlm_scand_stop(void)
+{
+	kthread_stop(scand_task);
+}
+
+static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (ls->ls_namelen == namelen &&
+		    memcmp(ls->ls_name, name, namelen) == 0)
+			goto out;
+	}
+	ls = NULL;
+ out:
+	spin_unlock(&lslist_lock);
+	return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (ls->ls_global_id == id) {
+			ls->ls_count++;
+			goto out;
+		}
+	}
+	ls = NULL;
+ out:
+	spin_unlock(&lslist_lock);
+	return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_local(dlm_lockspace_t *lockspace)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (ls->ls_local_handle == lockspace) {
+			ls->ls_count++;
+			goto out;
+		}
+	}
+	ls = NULL;
+ out:
+	spin_unlock(&lslist_lock);
+	return ls;
+}
+
+struct dlm_ls *dlm_find_lockspace_device(int minor)
+{
+	struct dlm_ls *ls;
+
+	spin_lock(&lslist_lock);
+	list_for_each_entry(ls, &lslist, ls_list) {
+		if (ls->ls_device.minor == minor) {
+			ls->ls_count++;
+			goto out;
+		}
+	}
+	ls = NULL;
+ out:
+	spin_unlock(&lslist_lock);
+	return ls;
+}
+
+void dlm_put_lockspace(struct dlm_ls *ls)
+{
+	spin_lock(&lslist_lock);
+	ls->ls_count--;
+	spin_unlock(&lslist_lock);
+}
+
+static void remove_lockspace(struct dlm_ls *ls)
+{
+	for (;;) {
+		spin_lock(&lslist_lock);
+		if (ls->ls_count == 0) {
+			list_del(&ls->ls_list);
+			spin_unlock(&lslist_lock);
+			return;
+		}
+		spin_unlock(&lslist_lock);
+		ssleep(1);
+	}
+}
+
+static int threads_start(void)
+{
+	int error;
+
+	/* Thread which process lock requests for all lockspace's */
+	error = dlm_astd_start();
+	if (error) {
+		log_print("cannot start dlm_astd thread %d", error);
+		goto fail;
+	}
+
+	error = dlm_scand_start();
+	if (error) {
+		log_print("cannot start dlm_scand thread %d", error);
+		goto astd_fail;
+	}
+
+	/* Thread for sending/receiving messages for all lockspace's */
+	error = dlm_lowcomms_start();
+	if (error) {
+		log_print("cannot start dlm lowcomms %d", error);
+		goto scand_fail;
+	}
+
+	return 0;
+
+ scand_fail:
+	dlm_scand_stop();
+ astd_fail:
+	dlm_astd_stop();
+ fail:
+	return error;
+}
+
+static void threads_stop(void)
+{
+	dlm_scand_stop();
+	dlm_lowcomms_stop();
+	dlm_astd_stop();
+}
+
+static int new_lockspace(char *name, int namelen, void **lockspace,
+			 uint32_t flags, int lvblen)
+{
+	struct dlm_ls *ls;
+	int i, size, error = -ENOMEM;
+
+	if (namelen > DLM_LOCKSPACE_LEN)
+		return -EINVAL;
+
+	if (!lvblen || (lvblen % 8))
+		return -EINVAL;
+
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+
+	ls = dlm_find_lockspace_name(name, namelen);
+	if (ls) {
+		*lockspace = ls;
+		module_put(THIS_MODULE);
+		return -EEXIST;
+	}
+
+	ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
+	if (!ls)
+		goto out;
+	memcpy(ls->ls_name, name, namelen);
+	ls->ls_namelen = namelen;
+	ls->ls_exflags = flags;
+	ls->ls_lvblen = lvblen;
+	ls->ls_count = 0;
+	ls->ls_flags = 0;
+
+	size = dlm_config.rsbtbl_size;
+	ls->ls_rsbtbl_size = size;
+
+	ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
+	if (!ls->ls_rsbtbl)
+		goto out_lsfree;
+	for (i = 0; i < size; i++) {
+		INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
+		INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
+		rwlock_init(&ls->ls_rsbtbl[i].lock);
+	}
+
+	size = dlm_config.lkbtbl_size;
+	ls->ls_lkbtbl_size = size;
+
+	ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
+	if (!ls->ls_lkbtbl)
+		goto out_rsbfree;
+	for (i = 0; i < size; i++) {
+		INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list);
+		rwlock_init(&ls->ls_lkbtbl[i].lock);
+		ls->ls_lkbtbl[i].counter = 1;
+	}
+
+	size = dlm_config.dirtbl_size;
+	ls->ls_dirtbl_size = size;
+
+	ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
+	if (!ls->ls_dirtbl)
+		goto out_lkbfree;
+	for (i = 0; i < size; i++) {
+		INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
+		rwlock_init(&ls->ls_dirtbl[i].lock);
+	}
+
+	INIT_LIST_HEAD(&ls->ls_waiters);
+	mutex_init(&ls->ls_waiters_mutex);
+
+	INIT_LIST_HEAD(&ls->ls_nodes);
+	INIT_LIST_HEAD(&ls->ls_nodes_gone);
+	ls->ls_num_nodes = 0;
+	ls->ls_low_nodeid = 0;
+	ls->ls_total_weight = 0;
+	ls->ls_node_array = NULL;
+
+	memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb));
+	ls->ls_stub_rsb.res_ls = ls;
+
+	ls->ls_debug_rsb_dentry = NULL;
+	ls->ls_debug_waiters_dentry = NULL;
+
+	init_waitqueue_head(&ls->ls_uevent_wait);
+	ls->ls_uevent_result = 0;
+
+	ls->ls_recoverd_task = NULL;
+	mutex_init(&ls->ls_recoverd_active);
+	spin_lock_init(&ls->ls_recover_lock);
+	ls->ls_recover_status = 0;
+	ls->ls_recover_seq = 0;
+	ls->ls_recover_args = NULL;
+	init_rwsem(&ls->ls_in_recovery);
+	INIT_LIST_HEAD(&ls->ls_requestqueue);
+	mutex_init(&ls->ls_requestqueue_mutex);
+	mutex_init(&ls->ls_clear_proc_locks);
+
+	ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+	if (!ls->ls_recover_buf)
+		goto out_dirfree;
+
+	INIT_LIST_HEAD(&ls->ls_recover_list);
+	spin_lock_init(&ls->ls_recover_list_lock);
+	ls->ls_recover_list_count = 0;
+	ls->ls_local_handle = ls;
+	init_waitqueue_head(&ls->ls_wait_general);
+	INIT_LIST_HEAD(&ls->ls_root_list);
+	init_rwsem(&ls->ls_root_sem);
+
+	down_write(&ls->ls_in_recovery);
+
+	spin_lock(&lslist_lock);
+	list_add(&ls->ls_list, &lslist);
+	spin_unlock(&lslist_lock);
+
+	/* needs to find ls in lslist */
+	error = dlm_recoverd_start(ls);
+	if (error) {
+		log_error(ls, "can't start dlm_recoverd %d", error);
+		goto out_rcomfree;
+	}
+
+	dlm_create_debug_file(ls);
+
+	error = kobject_setup(ls);
+	if (error)
+		goto out_del;
+
+	error = kobject_register(&ls->ls_kobj);
+	if (error)
+		goto out_del;
+
+	error = do_uevent(ls, 1);
+	if (error)
+		goto out_unreg;
+
+	*lockspace = ls;
+	return 0;
+
+ out_unreg:
+	kobject_unregister(&ls->ls_kobj);
+ out_del:
+	dlm_delete_debug_file(ls);
+	dlm_recoverd_stop(ls);
+ out_rcomfree:
+	spin_lock(&lslist_lock);
+	list_del(&ls->ls_list);
+	spin_unlock(&lslist_lock);
+	kfree(ls->ls_recover_buf);
+ out_dirfree:
+	kfree(ls->ls_dirtbl);
+ out_lkbfree:
+	kfree(ls->ls_lkbtbl);
+ out_rsbfree:
+	kfree(ls->ls_rsbtbl);
+ out_lsfree:
+	kfree(ls);
+ out:
+	module_put(THIS_MODULE);
+	return error;
+}
+
+int dlm_new_lockspace(char *name, int namelen, void **lockspace,
+		      uint32_t flags, int lvblen)
+{
+	int error = 0;
+
+	mutex_lock(&ls_lock);
+	if (!ls_count)
+		error = threads_start();
+	if (error)
+		goto out;
+
+	error = new_lockspace(name, namelen, lockspace, flags, lvblen);
+	if (!error)
+		ls_count++;
+ out:
+	mutex_unlock(&ls_lock);
+	return error;
+}
+
+/* Return 1 if the lockspace still has active remote locks,
+ *        2 if the lockspace still has active local locks.
+ */
+static int lockspace_busy(struct dlm_ls *ls)
+{
+	int i, lkb_found = 0;
+	struct dlm_lkb *lkb;
+
+	/* NOTE: We check the lockidtbl here rather than the resource table.
+	   This is because there may be LKBs queued as ASTs that have been
+	   unlinked from their RSBs and are pending deletion once the AST has
+	   been delivered */
+
+	for (i = 0; i < ls->ls_lkbtbl_size; i++) {
+		read_lock(&ls->ls_lkbtbl[i].lock);
+		if (!list_empty(&ls->ls_lkbtbl[i].list)) {
+			lkb_found = 1;
+			list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list,
+					    lkb_idtbl_list) {
+				if (!lkb->lkb_nodeid) {
+					read_unlock(&ls->ls_lkbtbl[i].lock);
+					return 2;
+				}
+			}
+		}
+		read_unlock(&ls->ls_lkbtbl[i].lock);
+	}
+	return lkb_found;
+}
+
+static int release_lockspace(struct dlm_ls *ls, int force)
+{
+	struct dlm_lkb *lkb;
+	struct dlm_rsb *rsb;
+	struct list_head *head;
+	int i;
+	int busy = lockspace_busy(ls);
+
+	if (busy > force)
+		return -EBUSY;
+
+	if (force < 3)
+		do_uevent(ls, 0);
+
+	dlm_recoverd_stop(ls);
+
+	remove_lockspace(ls);
+
+	dlm_delete_debug_file(ls);
+
+	dlm_astd_suspend();
+
+	kfree(ls->ls_recover_buf);
+
+	/*
+	 * Free direntry structs.
+	 */
+
+	dlm_dir_clear(ls);
+	kfree(ls->ls_dirtbl);
+
+	/*
+	 * Free all lkb's on lkbtbl[] lists.
+	 */
+
+	for (i = 0; i < ls->ls_lkbtbl_size; i++) {
+		head = &ls->ls_lkbtbl[i].list;
+		while (!list_empty(head)) {
+			lkb = list_entry(head->next, struct dlm_lkb,
+					 lkb_idtbl_list);
+
+			list_del(&lkb->lkb_idtbl_list);
+
+			dlm_del_ast(lkb);
+
+			if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY)
+				free_lvb(lkb->lkb_lvbptr);
+
+			free_lkb(lkb);
+		}
+	}
+	dlm_astd_resume();
+
+	kfree(ls->ls_lkbtbl);
+
+	/*
+	 * Free all rsb's on rsbtbl[] lists
+	 */
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		head = &ls->ls_rsbtbl[i].list;
+		while (!list_empty(head)) {
+			rsb = list_entry(head->next, struct dlm_rsb,
+					 res_hashchain);
+
+			list_del(&rsb->res_hashchain);
+			free_rsb(rsb);
+		}
+
+		head = &ls->ls_rsbtbl[i].toss;
+		while (!list_empty(head)) {
+			rsb = list_entry(head->next, struct dlm_rsb,
+					 res_hashchain);
+			list_del(&rsb->res_hashchain);
+			free_rsb(rsb);
+		}
+	}
+
+	kfree(ls->ls_rsbtbl);
+
+	/*
+	 * Free structures on any other lists
+	 */
+
+	kfree(ls->ls_recover_args);
+	dlm_clear_free_entries(ls);
+	dlm_clear_members(ls);
+	dlm_clear_members_gone(ls);
+	kfree(ls->ls_node_array);
+	kobject_unregister(&ls->ls_kobj);
+	kfree(ls);
+
+	mutex_lock(&ls_lock);
+	ls_count--;
+	if (!ls_count)
+		threads_stop();
+	mutex_unlock(&ls_lock);
+
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+/*
+ * Called when a system has released all its locks and is not going to use the
+ * lockspace any longer.  We free everything we're managing for this lockspace.
+ * Remaining nodes will go through the recovery process as if we'd died.  The
+ * lockspace must continue to function as usual, participating in recoveries,
+ * until this returns.
+ *
+ * Force has 4 possible values:
+ * 0 - don't destroy locksapce if it has any LKBs
+ * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs
+ * 2 - destroy lockspace regardless of LKBs
+ * 3 - destroy lockspace as part of a forced shutdown
+ */
+
+int dlm_release_lockspace(void *lockspace, int force)
+{
+	struct dlm_ls *ls;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -EINVAL;
+	dlm_put_lockspace(ls);
+	return release_lockspace(ls, force);
+}
+
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
new file mode 100644
index 000000000000..891eabbdd021
--- /dev/null
+++ b/fs/dlm/lockspace.h
@@ -0,0 +1,25 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOCKSPACE_DOT_H__
+#define __LOCKSPACE_DOT_H__
+
+int dlm_lockspace_init(void);
+void dlm_lockspace_exit(void);
+struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
+struct dlm_ls *dlm_find_lockspace_local(void *id);
+struct dlm_ls *dlm_find_lockspace_device(int minor);
+void dlm_put_lockspace(struct dlm_ls *ls);
+
+#endif				/* __LOCKSPACE_DOT_H__ */
+
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
new file mode 100644
index 000000000000..6da6b14d5a61
--- /dev/null
+++ b/fs/dlm/lowcomms.c
@@ -0,0 +1,1239 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * lowcomms.c
+ *
+ * This is the "low-level" comms layer.
+ *
+ * It is responsible for sending/receiving messages
+ * from other nodes in the cluster.
+ *
+ * Cluster nodes are referred to by their nodeids. nodeids are
+ * simply 32 bit numbers to the locking module - if they need to
+ * be expanded for the cluster infrastructure then that is it's
+ * responsibility. It is this layer's
+ * responsibility to resolve these into IP address or
+ * whatever it needs for inter-node communication.
+ *
+ * The comms level is two kernel threads that deal mainly with
+ * the receiving of messages from other nodes and passing them
+ * up to the mid-level comms layer (which understands the
+ * message format) for execution by the locking core, and
+ * a send thread which does all the setting up of connections
+ * to remote nodes and the sending of data. Threads are not allowed
+ * to send their own data because it may cause them to wait in times
+ * of high load. Also, this way, the sending thread can collect together
+ * messages bound for one node and send them in one block.
+ *
+ * I don't see any problem with the recv thread executing the locking
+ * code on behalf of remote processes as the locking code is
+ * short, efficient and never (well, hardly ever) waits.
+ *
+ */
+
+#include <asm/ioctls.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/sctp/user.h>
+#include <linux/pagemap.h>
+#include <linux/socket.h>
+#include <linux/idr.h>
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "midcomms.h"
+
+static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT];
+static int			dlm_local_count;
+static int			dlm_local_nodeid;
+
+/* One of these per connected node */
+
+#define NI_INIT_PENDING 1
+#define NI_WRITE_PENDING 2
+
+struct nodeinfo {
+	spinlock_t		lock;
+	sctp_assoc_t		assoc_id;
+	unsigned long		flags;
+	struct list_head	write_list; /* nodes with pending writes */
+	struct list_head	writequeue; /* outgoing writequeue_entries */
+	spinlock_t		writequeue_lock;
+	int			nodeid;
+};
+
+static DEFINE_IDR(nodeinfo_idr);
+static struct rw_semaphore	nodeinfo_lock;
+static int			max_nodeid;
+
+struct cbuf {
+	unsigned		base;
+	unsigned		len;
+	unsigned		mask;
+};
+
+/* Just the one of these, now. But this struct keeps
+   the connection-specific variables together */
+
+#define CF_READ_PENDING 1
+
+struct connection {
+	struct socket          *sock;
+	unsigned long		flags;
+	struct page            *rx_page;
+	atomic_t		waiting_requests;
+	struct cbuf		cb;
+	int                     eagain_flag;
+};
+
+/* An entry waiting to be sent */
+
+struct writequeue_entry {
+	struct list_head	list;
+	struct page            *page;
+	int			offset;
+	int			len;
+	int			end;
+	int			users;
+	struct nodeinfo        *ni;
+};
+
+#define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0)
+#define CBUF_EMPTY(cb) ((cb)->len == 0)
+#define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1))
+#define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask)
+
+#define CBUF_INIT(cb, size) \
+do { \
+	(cb)->base = (cb)->len = 0; \
+	(cb)->mask = ((size)-1); \
+} while(0)
+
+#define CBUF_EAT(cb, n) \
+do { \
+	(cb)->len  -= (n); \
+	(cb)->base += (n); \
+	(cb)->base &= (cb)->mask; \
+} while(0)
+
+
+/* List of nodes which have writes pending */
+static struct list_head write_nodes;
+static spinlock_t write_nodes_lock;
+
+/* Maximum number of incoming messages to process before
+ * doing a schedule()
+ */
+#define MAX_RX_MSG_COUNT 25
+
+/* Manage daemons */
+static struct task_struct *recv_task;
+static struct task_struct *send_task;
+static wait_queue_head_t lowcomms_recv_wait;
+static atomic_t accepting;
+
+/* The SCTP connection */
+static struct connection sctp_con;
+
+
+static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
+{
+	struct sockaddr_storage addr;
+	int error;
+
+	if (!dlm_local_count)
+		return -1;
+
+	error = dlm_nodeid_to_addr(nodeid, &addr);
+	if (error)
+		return error;
+
+	if (dlm_local_addr[0]->ss_family == AF_INET) {
+	        struct sockaddr_in *in4  = (struct sockaddr_in *) &addr;
+		struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr;
+		ret4->sin_addr.s_addr = in4->sin_addr.s_addr;
+	} else {
+	        struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
+		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
+		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
+		       sizeof(in6->sin6_addr));
+	}
+
+	return 0;
+}
+
+static struct nodeinfo *nodeid2nodeinfo(int nodeid, gfp_t alloc)
+{
+	struct nodeinfo *ni;
+	int r;
+	int n;
+
+	down_read(&nodeinfo_lock);
+	ni = idr_find(&nodeinfo_idr, nodeid);
+	up_read(&nodeinfo_lock);
+
+	if (!ni && alloc) {
+		down_write(&nodeinfo_lock);
+
+		ni = idr_find(&nodeinfo_idr, nodeid);
+		if (ni)
+			goto out_up;
+
+		r = idr_pre_get(&nodeinfo_idr, alloc);
+		if (!r)
+			goto out_up;
+
+		ni = kmalloc(sizeof(struct nodeinfo), alloc);
+		if (!ni)
+			goto out_up;
+
+		r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n);
+		if (r) {
+			kfree(ni);
+			ni = NULL;
+			goto out_up;
+		}
+		if (n != nodeid) {
+			idr_remove(&nodeinfo_idr, n);
+			kfree(ni);
+			ni = NULL;
+			goto out_up;
+		}
+		memset(ni, 0, sizeof(struct nodeinfo));
+		spin_lock_init(&ni->lock);
+		INIT_LIST_HEAD(&ni->writequeue);
+		spin_lock_init(&ni->writequeue_lock);
+		ni->nodeid = nodeid;
+
+		if (nodeid > max_nodeid)
+			max_nodeid = nodeid;
+	out_up:
+		up_write(&nodeinfo_lock);
+	}
+
+	return ni;
+}
+
+/* Don't call this too often... */
+static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc)
+{
+	int i;
+	struct nodeinfo *ni;
+
+	for (i=1; i<=max_nodeid; i++) {
+		ni = nodeid2nodeinfo(i, 0);
+		if (ni && ni->assoc_id == assoc)
+			return ni;
+	}
+	return NULL;
+}
+
+/* Data or notification available on socket */
+static void lowcomms_data_ready(struct sock *sk, int count_unused)
+{
+	atomic_inc(&sctp_con.waiting_requests);
+	if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags))
+		return;
+
+	wake_up_interruptible(&lowcomms_recv_wait);
+}
+
+
+/* Add the port number to an IP6 or 4 sockaddr and return the address length.
+   Also padd out the struct with zeros to make comparisons meaningful */
+
+static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
+			  int *addr_len)
+{
+	struct sockaddr_in *local4_addr;
+	struct sockaddr_in6 *local6_addr;
+
+	if (!dlm_local_count)
+		return;
+
+	if (!port) {
+		if (dlm_local_addr[0]->ss_family == AF_INET) {
+			local4_addr = (struct sockaddr_in *)dlm_local_addr[0];
+			port = be16_to_cpu(local4_addr->sin_port);
+		} else {
+			local6_addr = (struct sockaddr_in6 *)dlm_local_addr[0];
+			port = be16_to_cpu(local6_addr->sin6_port);
+		}
+	}
+
+	saddr->ss_family = dlm_local_addr[0]->ss_family;
+	if (dlm_local_addr[0]->ss_family == AF_INET) {
+		struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
+		in4_addr->sin_port = cpu_to_be16(port);
+		memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero));
+		memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) -
+				      sizeof(struct sockaddr_in));
+		*addr_len = sizeof(struct sockaddr_in);
+	} else {
+		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
+		in6_addr->sin6_port = cpu_to_be16(port);
+		memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) -
+				      sizeof(struct sockaddr_in6));
+		*addr_len = sizeof(struct sockaddr_in6);
+	}
+}
+
+/* Close the connection and tidy up */
+static void close_connection(void)
+{
+	if (sctp_con.sock) {
+		sock_release(sctp_con.sock);
+		sctp_con.sock = NULL;
+	}
+
+	if (sctp_con.rx_page) {
+		__free_page(sctp_con.rx_page);
+		sctp_con.rx_page = NULL;
+	}
+}
+
+/* We only send shutdown messages to nodes that are not part of the cluster */
+static void send_shutdown(sctp_assoc_t associd)
+{
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	int ret;
+
+	outmessage.msg_name = NULL;
+	outmessage.msg_namelen = 0;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	outmessage.msg_controllen = cmsg->cmsg_len;
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+
+	sinfo->sinfo_flags |= MSG_EOF;
+	sinfo->sinfo_assoc_id = associd;
+
+	ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0);
+
+	if (ret != 0)
+		log_print("send EOF to node failed: %d", ret);
+}
+
+
+/* INIT failed but we don't know which node...
+   restart INIT on all pending nodes */
+static void init_failed(void)
+{
+	int i;
+	struct nodeinfo *ni;
+
+	for (i=1; i<=max_nodeid; i++) {
+		ni = nodeid2nodeinfo(i, 0);
+		if (!ni)
+			continue;
+
+		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+			ni->assoc_id = 0;
+			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+				spin_lock_bh(&write_nodes_lock);
+				list_add_tail(&ni->write_list, &write_nodes);
+				spin_unlock_bh(&write_nodes_lock);
+			}
+		}
+	}
+	wake_up_process(send_task);
+}
+
+/* Something happened to an association */
+static void process_sctp_notification(struct msghdr *msg, char *buf)
+{
+	union sctp_notification *sn = (union sctp_notification *)buf;
+
+	if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
+		switch (sn->sn_assoc_change.sac_state) {
+
+		case SCTP_COMM_UP:
+		case SCTP_RESTART:
+		{
+			/* Check that the new node is in the lockspace */
+			struct sctp_prim prim;
+			mm_segment_t fs;
+			int nodeid;
+			int prim_len, ret;
+			int addr_len;
+			struct nodeinfo *ni;
+
+			/* This seems to happen when we received a connection
+			 * too early... or something...  anyway, it happens but
+			 * we always seem to get a real message too, see
+			 * receive_from_sock */
+
+			if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) {
+				log_print("COMM_UP for invalid assoc ID %d",
+					 (int)sn->sn_assoc_change.sac_assoc_id);
+				init_failed();
+				return;
+			}
+			memset(&prim, 0, sizeof(struct sctp_prim));
+			prim_len = sizeof(struct sctp_prim);
+			prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id;
+
+			fs = get_fs();
+			set_fs(get_ds());
+			ret = sctp_con.sock->ops->getsockopt(sctp_con.sock,
+						IPPROTO_SCTP, SCTP_PRIMARY_ADDR,
+						(char*)&prim, &prim_len);
+			set_fs(fs);
+			if (ret < 0) {
+				struct nodeinfo *ni;
+
+				log_print("getsockopt/sctp_primary_addr on "
+					  "new assoc %d failed : %d",
+				    (int)sn->sn_assoc_change.sac_assoc_id, ret);
+
+				/* Retry INIT later */
+				ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+				if (ni)
+					clear_bit(NI_INIT_PENDING, &ni->flags);
+				return;
+			}
+			make_sockaddr(&prim.ssp_addr, 0, &addr_len);
+			if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) {
+				log_print("reject connect from unknown addr");
+				send_shutdown(prim.ssp_assoc_id);
+				return;
+			}
+
+			ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+			if (!ni)
+				return;
+
+			/* Save the assoc ID */
+			spin_lock(&ni->lock);
+			ni->assoc_id = sn->sn_assoc_change.sac_assoc_id;
+			spin_unlock(&ni->lock);
+
+			log_print("got new/restarted association %d nodeid %d",
+			       (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+
+			/* Send any pending writes */
+			clear_bit(NI_INIT_PENDING, &ni->flags);
+			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+				spin_lock_bh(&write_nodes_lock);
+				list_add_tail(&ni->write_list, &write_nodes);
+				spin_unlock_bh(&write_nodes_lock);
+			}
+			wake_up_process(send_task);
+		}
+		break;
+
+		case SCTP_COMM_LOST:
+		case SCTP_SHUTDOWN_COMP:
+		{
+			struct nodeinfo *ni;
+
+			ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id);
+			if (ni) {
+				spin_lock(&ni->lock);
+				ni->assoc_id = 0;
+				spin_unlock(&ni->lock);
+			}
+		}
+		break;
+
+		/* We don't know which INIT failed, so clear the PENDING flags
+		 * on them all.  if assoc_id is zero then it will then try
+		 * again */
+
+		case SCTP_CANT_STR_ASSOC:
+		{
+			log_print("Can't start SCTP association - retrying");
+			init_failed();
+		}
+		break;
+
+		default:
+			log_print("unexpected SCTP assoc change id=%d state=%d",
+				  (int)sn->sn_assoc_change.sac_assoc_id,
+				  sn->sn_assoc_change.sac_state);
+		}
+	}
+}
+
+/* Data received from remote end */
+static int receive_from_sock(void)
+{
+	int ret = 0;
+	struct msghdr msg;
+	struct kvec iov[2];
+	unsigned len;
+	int r;
+	struct sctp_sndrcvinfo *sinfo;
+	struct cmsghdr *cmsg;
+	struct nodeinfo *ni;
+
+	/* These two are marginally too big for stack allocation, but this
+	 * function is (currently) only called by dlm_recvd so static should be
+	 * OK.
+	 */
+	static struct sockaddr_storage msgname;
+	static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+
+	if (sctp_con.sock == NULL)
+		goto out;
+
+	if (sctp_con.rx_page == NULL) {
+		/*
+		 * This doesn't need to be atomic, but I think it should
+		 * improve performance if it is.
+		 */
+		sctp_con.rx_page = alloc_page(GFP_ATOMIC);
+		if (sctp_con.rx_page == NULL)
+			goto out_resched;
+		CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE);
+	}
+
+	memset(&incmsg, 0, sizeof(incmsg));
+	memset(&msgname, 0, sizeof(msgname));
+
+	memset(incmsg, 0, sizeof(incmsg));
+	msg.msg_name = &msgname;
+	msg.msg_namelen = sizeof(msgname);
+	msg.msg_flags = 0;
+	msg.msg_control = incmsg;
+	msg.msg_controllen = sizeof(incmsg);
+	msg.msg_iovlen = 1;
+
+	/* I don't see why this circular buffer stuff is necessary for SCTP
+	 * which is a packet-based protocol, but the whole thing breaks under
+	 * load without it! The overhead is minimal (and is in the TCP lowcomms
+	 * anyway, of course) so I'll leave it in until I can figure out what's
+	 * really happening.
+	 */
+
+	/*
+	 * iov[0] is the bit of the circular buffer between the current end
+	 * point (cb.base + cb.len) and the end of the buffer.
+	 */
+	iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb);
+	iov[0].iov_base = page_address(sctp_con.rx_page) +
+			  CBUF_DATA(&sctp_con.cb);
+	iov[1].iov_len = 0;
+
+	/*
+	 * iov[1] is the bit of the circular buffer between the start of the
+	 * buffer and the start of the currently used section (cb.base)
+	 */
+	if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) {
+		iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb);
+		iov[1].iov_len = sctp_con.cb.base;
+		iov[1].iov_base = page_address(sctp_con.rx_page);
+		msg.msg_iovlen = 2;
+	}
+	len = iov[0].iov_len + iov[1].iov_len;
+
+	r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, msg.msg_iovlen, len,
+				 MSG_NOSIGNAL | MSG_DONTWAIT);
+	if (ret <= 0)
+		goto out_close;
+
+	msg.msg_control = incmsg;
+	msg.msg_controllen = sizeof(incmsg);
+	cmsg = CMSG_FIRSTHDR(&msg);
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+
+	if (msg.msg_flags & MSG_NOTIFICATION) {
+		process_sctp_notification(&msg, page_address(sctp_con.rx_page));
+		return 0;
+	}
+
+	/* Is this a new association ? */
+	ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL);
+	if (ni) {
+		ni->assoc_id = sinfo->sinfo_assoc_id;
+		if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) {
+
+			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+				spin_lock_bh(&write_nodes_lock);
+				list_add_tail(&ni->write_list, &write_nodes);
+				spin_unlock_bh(&write_nodes_lock);
+			}
+			wake_up_process(send_task);
+		}
+	}
+
+	/* INIT sends a message with length of 1 - ignore it */
+	if (r == 1)
+		return 0;
+
+	CBUF_ADD(&sctp_con.cb, ret);
+	ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid),
+					  page_address(sctp_con.rx_page),
+					  sctp_con.cb.base, sctp_con.cb.len,
+					  PAGE_CACHE_SIZE);
+	if (ret < 0)
+		goto out_close;
+	CBUF_EAT(&sctp_con.cb, ret);
+
+      out:
+	ret = 0;
+	goto out_ret;
+
+      out_resched:
+	lowcomms_data_ready(sctp_con.sock->sk, 0);
+	ret = 0;
+	schedule();
+	goto out_ret;
+
+      out_close:
+	if (ret != -EAGAIN)
+		log_print("error reading from sctp socket: %d", ret);
+      out_ret:
+	return ret;
+}
+
+/* Bind to an IP address. SCTP allows multiple address so it can do multi-homing */
+static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num)
+{
+	mm_segment_t fs;
+	int result = 0;
+
+	fs = get_fs();
+	set_fs(get_ds());
+	if (num == 1)
+		result = sctp_con.sock->ops->bind(sctp_con.sock,
+					(struct sockaddr *) addr, addr_len);
+	else
+		result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP,
+				SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len);
+	set_fs(fs);
+
+	if (result < 0)
+		log_print("Can't bind to port %d addr number %d",
+			  dlm_config.tcp_port, num);
+
+	return result;
+}
+
+static void init_local(void)
+{
+	struct sockaddr_storage sas, *addr;
+	int i;
+
+	dlm_local_nodeid = dlm_our_nodeid();
+
+	for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) {
+		if (dlm_our_addr(&sas, i))
+			break;
+
+		addr = kmalloc(sizeof(*addr), GFP_KERNEL);
+		if (!addr)
+			break;
+		memcpy(addr, &sas, sizeof(*addr));
+		dlm_local_addr[dlm_local_count++] = addr;
+	}
+}
+
+/* Initialise SCTP socket and bind to all interfaces */
+static int init_sock(void)
+{
+	mm_segment_t fs;
+	struct socket *sock = NULL;
+	struct sockaddr_storage localaddr;
+	struct sctp_event_subscribe subscribe;
+	int result = -EINVAL, num = 1, i, addr_len;
+
+	if (!dlm_local_count) {
+		init_local();
+		if (!dlm_local_count) {
+			log_print("no local IP address has been set");
+			goto out;
+		}
+	}
+
+	result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET,
+				  IPPROTO_SCTP, &sock);
+	if (result < 0) {
+		log_print("Can't create comms socket, check SCTP is loaded");
+		goto out;
+	}
+
+	/* Listen for events */
+	memset(&subscribe, 0, sizeof(subscribe));
+	subscribe.sctp_data_io_event = 1;
+	subscribe.sctp_association_event = 1;
+	subscribe.sctp_send_failure_event = 1;
+	subscribe.sctp_shutdown_event = 1;
+	subscribe.sctp_partial_delivery_event = 1;
+
+	fs = get_fs();
+	set_fs(get_ds());
+	result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS,
+				       (char *)&subscribe, sizeof(subscribe));
+	set_fs(fs);
+
+	if (result < 0) {
+		log_print("Failed to set SCTP_EVENTS on socket: result=%d",
+			  result);
+		goto create_delsock;
+	}
+
+	/* Init con struct */
+	sock->sk->sk_user_data = &sctp_con;
+	sctp_con.sock = sock;
+	sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready;
+
+	/* Bind to all interfaces. */
+	for (i = 0; i < dlm_local_count; i++) {
+		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
+		make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
+
+		result = add_bind_addr(&localaddr, addr_len, num);
+		if (result)
+			goto create_delsock;
+		++num;
+	}
+
+	result = sock->ops->listen(sock, 5);
+	if (result < 0) {
+		log_print("Can't set socket listening");
+		goto create_delsock;
+	}
+
+	return 0;
+
+ create_delsock:
+	sock_release(sock);
+	sctp_con.sock = NULL;
+ out:
+	return result;
+}
+
+
+static struct writequeue_entry *new_writequeue_entry(gfp_t allocation)
+{
+	struct writequeue_entry *entry;
+
+	entry = kmalloc(sizeof(struct writequeue_entry), allocation);
+	if (!entry)
+		return NULL;
+
+	entry->page = alloc_page(allocation);
+	if (!entry->page) {
+		kfree(entry);
+		return NULL;
+	}
+
+	entry->offset = 0;
+	entry->len = 0;
+	entry->end = 0;
+	entry->users = 0;
+
+	return entry;
+}
+
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
+{
+	struct writequeue_entry *e;
+	int offset = 0;
+	int users = 0;
+	struct nodeinfo *ni;
+
+	if (!atomic_read(&accepting))
+		return NULL;
+
+	ni = nodeid2nodeinfo(nodeid, allocation);
+	if (!ni)
+		return NULL;
+
+	spin_lock(&ni->writequeue_lock);
+	e = list_entry(ni->writequeue.prev, struct writequeue_entry, list);
+	if (((struct list_head *) e == &ni->writequeue) ||
+	    (PAGE_CACHE_SIZE - e->end < len)) {
+		e = NULL;
+	} else {
+		offset = e->end;
+		e->end += len;
+		users = e->users++;
+	}
+	spin_unlock(&ni->writequeue_lock);
+
+	if (e) {
+	      got_one:
+		if (users == 0)
+			kmap(e->page);
+		*ppc = page_address(e->page) + offset;
+		return e;
+	}
+
+	e = new_writequeue_entry(allocation);
+	if (e) {
+		spin_lock(&ni->writequeue_lock);
+		offset = e->end;
+		e->end += len;
+		e->ni = ni;
+		users = e->users++;
+		list_add_tail(&e->list, &ni->writequeue);
+		spin_unlock(&ni->writequeue_lock);
+		goto got_one;
+	}
+	return NULL;
+}
+
+void dlm_lowcomms_commit_buffer(void *arg)
+{
+	struct writequeue_entry *e = (struct writequeue_entry *) arg;
+	int users;
+	struct nodeinfo *ni = e->ni;
+
+	if (!atomic_read(&accepting))
+		return;
+
+	spin_lock(&ni->writequeue_lock);
+	users = --e->users;
+	if (users)
+		goto out;
+	e->len = e->end - e->offset;
+	kunmap(e->page);
+	spin_unlock(&ni->writequeue_lock);
+
+	if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+		spin_lock_bh(&write_nodes_lock);
+		list_add_tail(&ni->write_list, &write_nodes);
+		spin_unlock_bh(&write_nodes_lock);
+		wake_up_process(send_task);
+	}
+	return;
+
+      out:
+	spin_unlock(&ni->writequeue_lock);
+	return;
+}
+
+static void free_entry(struct writequeue_entry *e)
+{
+	__free_page(e->page);
+	kfree(e);
+}
+
+/* Initiate an SCTP association. In theory we could just use sendmsg() on
+   the first IP address and it should work, but this allows us to set up the
+   association before sending any valuable data that we can't afford to lose.
+   It also keeps the send path clean as it can now always use the association ID */
+static void initiate_association(int nodeid)
+{
+	struct sockaddr_storage rem_addr;
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct msghdr outmessage;
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	int ret;
+	int addrlen;
+	char buf[1];
+	struct kvec iov[1];
+	struct nodeinfo *ni;
+
+	log_print("Initiating association with node %d", nodeid);
+
+	ni = nodeid2nodeinfo(nodeid, GFP_KERNEL);
+	if (!ni)
+		return;
+
+	if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) {
+		log_print("no address for nodeid %d", nodeid);
+		return;
+	}
+
+	make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
+
+	outmessage.msg_name = &rem_addr;
+	outmessage.msg_namelen = addrlen;
+	outmessage.msg_control = outcmsg;
+	outmessage.msg_controllen = sizeof(outcmsg);
+	outmessage.msg_flags = MSG_EOR;
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = 1;
+
+	/* Real INIT messages seem to cause trouble. Just send a 1 byte message
+	   we can afford to lose */
+	cmsg = CMSG_FIRSTHDR(&outmessage);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+
+	outmessage.msg_controllen = cmsg->cmsg_len;
+	ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1);
+	if (ret < 0) {
+		log_print("send INIT to node failed: %d", ret);
+		/* Try again later */
+		clear_bit(NI_INIT_PENDING, &ni->flags);
+	}
+}
+
+/* Send a message */
+static int send_to_sock(struct nodeinfo *ni)
+{
+	int ret = 0;
+	struct writequeue_entry *e;
+	int len, offset;
+	struct msghdr outmsg;
+	static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct cmsghdr *cmsg;
+	struct sctp_sndrcvinfo *sinfo;
+	struct kvec iov;
+
+        /* See if we need to init an association before we start
+	   sending precious messages */
+	spin_lock(&ni->lock);
+	if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+		spin_unlock(&ni->lock);
+		initiate_association(ni->nodeid);
+		return 0;
+	}
+	spin_unlock(&ni->lock);
+
+	outmsg.msg_name = NULL; /* We use assoc_id */
+	outmsg.msg_namelen = 0;
+	outmsg.msg_control = outcmsg;
+	outmsg.msg_controllen = sizeof(outcmsg);
+	outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR;
+
+	cmsg = CMSG_FIRSTHDR(&outmsg);
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo));
+	sinfo->sinfo_ppid = cpu_to_le32(dlm_local_nodeid);
+	sinfo->sinfo_assoc_id = ni->assoc_id;
+	outmsg.msg_controllen = cmsg->cmsg_len;
+
+	spin_lock(&ni->writequeue_lock);
+	for (;;) {
+		if (list_empty(&ni->writequeue))
+			break;
+		e = list_entry(ni->writequeue.next, struct writequeue_entry,
+			       list);
+		len = e->len;
+		offset = e->offset;
+		BUG_ON(len == 0 && e->users == 0);
+		spin_unlock(&ni->writequeue_lock);
+		kmap(e->page);
+
+		ret = 0;
+		if (len) {
+			iov.iov_base = page_address(e->page)+offset;
+			iov.iov_len = len;
+
+			ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1,
+					     len);
+			if (ret == -EAGAIN) {
+				sctp_con.eagain_flag = 1;
+				goto out;
+			} else if (ret < 0)
+				goto send_error;
+		} else {
+			/* Don't starve people filling buffers */
+			schedule();
+		}
+
+		spin_lock(&ni->writequeue_lock);
+		e->offset += ret;
+		e->len -= ret;
+
+		if (e->len == 0 && e->users == 0) {
+			list_del(&e->list);
+			free_entry(e);
+			continue;
+		}
+	}
+	spin_unlock(&ni->writequeue_lock);
+ out:
+	return ret;
+
+ send_error:
+	log_print("Error sending to node %d %d", ni->nodeid, ret);
+	spin_lock(&ni->lock);
+	if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) {
+		ni->assoc_id = 0;
+		spin_unlock(&ni->lock);
+		initiate_association(ni->nodeid);
+	} else
+		spin_unlock(&ni->lock);
+
+	return ret;
+}
+
+/* Try to send any messages that are pending */
+static void process_output_queue(void)
+{
+	struct list_head *list;
+	struct list_head *temp;
+
+	spin_lock_bh(&write_nodes_lock);
+	list_for_each_safe(list, temp, &write_nodes) {
+		struct nodeinfo *ni =
+		    list_entry(list, struct nodeinfo, write_list);
+		clear_bit(NI_WRITE_PENDING, &ni->flags);
+		list_del(&ni->write_list);
+
+		spin_unlock_bh(&write_nodes_lock);
+
+		send_to_sock(ni);
+		spin_lock_bh(&write_nodes_lock);
+	}
+	spin_unlock_bh(&write_nodes_lock);
+}
+
+/* Called after we've had -EAGAIN and been woken up */
+static void refill_write_queue(void)
+{
+	int i;
+
+	for (i=1; i<=max_nodeid; i++) {
+		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+
+		if (ni) {
+			if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) {
+				spin_lock_bh(&write_nodes_lock);
+				list_add_tail(&ni->write_list, &write_nodes);
+				spin_unlock_bh(&write_nodes_lock);
+			}
+		}
+	}
+}
+
+static void clean_one_writequeue(struct nodeinfo *ni)
+{
+	struct list_head *list;
+	struct list_head *temp;
+
+	spin_lock(&ni->writequeue_lock);
+	list_for_each_safe(list, temp, &ni->writequeue) {
+		struct writequeue_entry *e =
+			list_entry(list, struct writequeue_entry, list);
+		list_del(&e->list);
+		free_entry(e);
+	}
+	spin_unlock(&ni->writequeue_lock);
+}
+
+static void clean_writequeues(void)
+{
+	int i;
+
+	for (i=1; i<=max_nodeid; i++) {
+		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+		if (ni)
+			clean_one_writequeue(ni);
+	}
+}
+
+
+static void dealloc_nodeinfo(void)
+{
+	int i;
+
+	for (i=1; i<=max_nodeid; i++) {
+		struct nodeinfo *ni = nodeid2nodeinfo(i, 0);
+		if (ni) {
+			idr_remove(&nodeinfo_idr, i);
+			kfree(ni);
+		}
+	}
+}
+
+int dlm_lowcomms_close(int nodeid)
+{
+	struct nodeinfo *ni;
+
+	ni = nodeid2nodeinfo(nodeid, 0);
+	if (!ni)
+		return -1;
+
+	spin_lock(&ni->lock);
+	if (ni->assoc_id) {
+		ni->assoc_id = 0;
+		/* Don't send shutdown here, sctp will just queue it
+		   till the node comes back up! */
+	}
+	spin_unlock(&ni->lock);
+
+	clean_one_writequeue(ni);
+	clear_bit(NI_INIT_PENDING, &ni->flags);
+	return 0;
+}
+
+static int write_list_empty(void)
+{
+	int status;
+
+	spin_lock_bh(&write_nodes_lock);
+	status = list_empty(&write_nodes);
+	spin_unlock_bh(&write_nodes_lock);
+
+	return status;
+}
+
+static int dlm_recvd(void *data)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	while (!kthread_should_stop()) {
+		int count = 0;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&lowcomms_recv_wait, &wait);
+		if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
+			schedule();
+		remove_wait_queue(&lowcomms_recv_wait, &wait);
+		set_current_state(TASK_RUNNING);
+
+		if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) {
+			int ret;
+
+			do {
+				ret = receive_from_sock();
+
+				/* Don't starve out everyone else */
+				if (++count >= MAX_RX_MSG_COUNT) {
+					schedule();
+					count = 0;
+				}
+			} while (!kthread_should_stop() && ret >=0);
+		}
+		schedule();
+	}
+
+	return 0;
+}
+
+static int dlm_sendd(void *data)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (write_list_empty())
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		if (sctp_con.eagain_flag) {
+			sctp_con.eagain_flag = 0;
+			refill_write_queue();
+		}
+		process_output_queue();
+	}
+
+	remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait);
+
+	return 0;
+}
+
+static void daemons_stop(void)
+{
+	kthread_stop(recv_task);
+	kthread_stop(send_task);
+}
+
+static int daemons_start(void)
+{
+	struct task_struct *p;
+	int error;
+
+	p = kthread_run(dlm_recvd, NULL, "dlm_recvd");
+	error = IS_ERR(p);
+       	if (error) {
+		log_print("can't start dlm_recvd %d", error);
+		return error;
+	}
+	recv_task = p;
+
+	p = kthread_run(dlm_sendd, NULL, "dlm_sendd");
+	error = IS_ERR(p);
+       	if (error) {
+		log_print("can't start dlm_sendd %d", error);
+		kthread_stop(recv_task);
+		return error;
+	}
+	send_task = p;
+
+	return 0;
+}
+
+/*
+ * This is quite likely to sleep...
+ */
+int dlm_lowcomms_start(void)
+{
+	int error;
+
+	error = init_sock();
+	if (error)
+		goto fail_sock;
+	error = daemons_start();
+	if (error)
+		goto fail_sock;
+	atomic_set(&accepting, 1);
+	return 0;
+
+ fail_sock:
+	close_connection();
+	return error;
+}
+
+/* Set all the activity flags to prevent any socket activity. */
+
+void dlm_lowcomms_stop(void)
+{
+	atomic_set(&accepting, 0);
+	sctp_con.flags = 0x7;
+	daemons_stop();
+	clean_writequeues();
+	close_connection();
+	dealloc_nodeinfo();
+	max_nodeid = 0;
+}
+
+int dlm_lowcomms_init(void)
+{
+	init_waitqueue_head(&lowcomms_recv_wait);
+	spin_lock_init(&write_nodes_lock);
+	INIT_LIST_HEAD(&write_nodes);
+	init_rwsem(&nodeinfo_lock);
+	return 0;
+}
+
+void dlm_lowcomms_exit(void)
+{
+	int i;
+
+	for (i = 0; i < dlm_local_count; i++)
+		kfree(dlm_local_addr[i]);
+	dlm_local_count = 0;
+	dlm_local_nodeid = 0;
+}
+
diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h
new file mode 100644
index 000000000000..2d045e0daae1
--- /dev/null
+++ b/fs/dlm/lowcomms.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LOWCOMMS_DOT_H__
+#define __LOWCOMMS_DOT_H__
+
+int dlm_lowcomms_init(void);
+void dlm_lowcomms_exit(void);
+int dlm_lowcomms_start(void);
+void dlm_lowcomms_stop(void);
+int dlm_lowcomms_close(int nodeid);
+void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc);
+void dlm_lowcomms_commit_buffer(void *mh);
+
+#endif				/* __LOWCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/lvb_table.h b/fs/dlm/lvb_table.h
new file mode 100644
index 000000000000..cc3e92f3feef
--- /dev/null
+++ b/fs/dlm/lvb_table.h
@@ -0,0 +1,18 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __LVB_TABLE_DOT_H__
+#define __LVB_TABLE_DOT_H__
+
+extern const int dlm_lvb_operations[8][8];
+
+#endif
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
new file mode 100644
index 000000000000..a8da8dc36b2e
--- /dev/null
+++ b/fs/dlm/main.c
@@ -0,0 +1,97 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "user.h"
+#include "memory.h"
+#include "lowcomms.h"
+#include "config.h"
+
+#ifdef CONFIG_DLM_DEBUG
+int dlm_register_debugfs(void);
+void dlm_unregister_debugfs(void);
+#else
+static inline int dlm_register_debugfs(void) { return 0; }
+static inline void dlm_unregister_debugfs(void) { }
+#endif
+
+static int __init init_dlm(void)
+{
+	int error;
+
+	error = dlm_memory_init();
+	if (error)
+		goto out;
+
+	error = dlm_lockspace_init();
+	if (error)
+		goto out_mem;
+
+	error = dlm_config_init();
+	if (error)
+		goto out_lockspace;
+
+	error = dlm_register_debugfs();
+	if (error)
+		goto out_config;
+
+	error = dlm_lowcomms_init();
+	if (error)
+		goto out_debug;
+
+	error = dlm_user_init();
+	if (error)
+		goto out_lowcomms;
+
+	printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
+
+	return 0;
+
+ out_lowcomms:
+	dlm_lowcomms_exit();
+ out_debug:
+	dlm_unregister_debugfs();
+ out_config:
+	dlm_config_exit();
+ out_lockspace:
+	dlm_lockspace_exit();
+ out_mem:
+	dlm_memory_exit();
+ out:
+	return error;
+}
+
+static void __exit exit_dlm(void)
+{
+	dlm_user_exit();
+	dlm_lowcomms_exit();
+	dlm_config_exit();
+	dlm_memory_exit();
+	dlm_lockspace_exit();
+	dlm_unregister_debugfs();
+}
+
+module_init(init_dlm);
+module_exit(exit_dlm);
+
+MODULE_DESCRIPTION("Distributed Lock Manager");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL_GPL(dlm_new_lockspace);
+EXPORT_SYMBOL_GPL(dlm_release_lockspace);
+EXPORT_SYMBOL_GPL(dlm_lock);
+EXPORT_SYMBOL_GPL(dlm_unlock);
+
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
new file mode 100644
index 000000000000..a3f7de7f3a8f
--- /dev/null
+++ b/fs/dlm/member.c
@@ -0,0 +1,327 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "recoverd.h"
+#include "recover.h"
+#include "rcom.h"
+#include "config.h"
+
+/*
+ * Following called by dlm_recoverd thread
+ */
+
+static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
+{
+	struct dlm_member *memb = NULL;
+	struct list_head *tmp;
+	struct list_head *newlist = &new->list;
+	struct list_head *head = &ls->ls_nodes;
+
+	list_for_each(tmp, head) {
+		memb = list_entry(tmp, struct dlm_member, list);
+		if (new->nodeid < memb->nodeid)
+			break;
+	}
+
+	if (!memb)
+		list_add_tail(newlist, head);
+	else {
+		/* FIXME: can use list macro here */
+		newlist->prev = tmp->prev;
+		newlist->next = tmp;
+		tmp->prev->next = newlist;
+		tmp->prev = newlist;
+	}
+}
+
+static int dlm_add_member(struct dlm_ls *ls, int nodeid)
+{
+	struct dlm_member *memb;
+	int w;
+
+	memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
+	if (!memb)
+		return -ENOMEM;
+
+	w = dlm_node_weight(ls->ls_name, nodeid);
+	if (w < 0)
+		return w;
+
+	memb->nodeid = nodeid;
+	memb->weight = w;
+	add_ordered_member(ls, memb);
+	ls->ls_num_nodes++;
+	return 0;
+}
+
+static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb)
+{
+	list_move(&memb->list, &ls->ls_nodes_gone);
+	ls->ls_num_nodes--;
+}
+
+static int dlm_is_member(struct dlm_ls *ls, int nodeid)
+{
+	struct dlm_member *memb;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->nodeid == nodeid)
+			return 1;
+	}
+	return 0;
+}
+
+int dlm_is_removed(struct dlm_ls *ls, int nodeid)
+{
+	struct dlm_member *memb;
+
+	list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
+		if (memb->nodeid == nodeid)
+			return 1;
+	}
+	return 0;
+}
+
+static void clear_memb_list(struct list_head *head)
+{
+	struct dlm_member *memb;
+
+	while (!list_empty(head)) {
+		memb = list_entry(head->next, struct dlm_member, list);
+		list_del(&memb->list);
+		kfree(memb);
+	}
+}
+
+void dlm_clear_members(struct dlm_ls *ls)
+{
+	clear_memb_list(&ls->ls_nodes);
+	ls->ls_num_nodes = 0;
+}
+
+void dlm_clear_members_gone(struct dlm_ls *ls)
+{
+	clear_memb_list(&ls->ls_nodes_gone);
+}
+
+static void make_member_array(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	int i, w, x = 0, total = 0, all_zero = 0, *array;
+
+	kfree(ls->ls_node_array);
+	ls->ls_node_array = NULL;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (memb->weight)
+			total += memb->weight;
+	}
+
+	/* all nodes revert to weight of 1 if all have weight 0 */
+
+	if (!total) {
+		total = ls->ls_num_nodes;
+		all_zero = 1;
+	}
+
+	ls->ls_total_weight = total;
+
+	array = kmalloc(sizeof(int) * total, GFP_KERNEL);
+	if (!array)
+		return;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (!all_zero && !memb->weight)
+			continue;
+
+		if (all_zero)
+			w = 1;
+		else
+			w = memb->weight;
+
+		DLM_ASSERT(x < total, printk("total %d x %d\n", total, x););
+
+		for (i = 0; i < w; i++)
+			array[x++] = memb->nodeid;
+	}
+
+	ls->ls_node_array = array;
+}
+
+/* send a status request to all members just to establish comms connections */
+
+static int ping_members(struct dlm_ls *ls)
+{
+	struct dlm_member *memb;
+	int error = 0;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		error = dlm_recovery_stopped(ls);
+		if (error)
+			break;
+		error = dlm_rcom_status(ls, memb->nodeid);
+		if (error)
+			break;
+	}
+	if (error)
+		log_debug(ls, "ping_members aborted %d last nodeid %d",
+			  error, ls->ls_recover_nodeid);
+	return error;
+}
+
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
+{
+	struct dlm_member *memb, *safe;
+	int i, error, found, pos = 0, neg = 0, low = -1;
+
+	/* move departed members from ls_nodes to ls_nodes_gone */
+
+	list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
+		found = 0;
+		for (i = 0; i < rv->node_count; i++) {
+			if (memb->nodeid == rv->nodeids[i]) {
+				found = 1;
+				break;
+			}
+		}
+
+		if (!found) {
+			neg++;
+			dlm_remove_member(ls, memb);
+			log_debug(ls, "remove member %d", memb->nodeid);
+		}
+	}
+
+	/* add new members to ls_nodes */
+
+	for (i = 0; i < rv->node_count; i++) {
+		if (dlm_is_member(ls, rv->nodeids[i]))
+			continue;
+		dlm_add_member(ls, rv->nodeids[i]);
+		pos++;
+		log_debug(ls, "add member %d", rv->nodeids[i]);
+	}
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		if (low == -1 || memb->nodeid < low)
+			low = memb->nodeid;
+	}
+	ls->ls_low_nodeid = low;
+
+	make_member_array(ls);
+	dlm_set_recover_status(ls, DLM_RS_NODES);
+	*neg_out = neg;
+
+	error = ping_members(ls);
+	if (error)
+		goto out;
+
+	error = dlm_recover_members_wait(ls);
+ out:
+	log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
+	return error;
+}
+
+/*
+ * Following called from lockspace.c
+ */
+
+int dlm_ls_stop(struct dlm_ls *ls)
+{
+	int new;
+
+	/*
+	 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
+	 * dlm_recovery_stopped()) and prevents any new locks from being
+	 * processed (see RUNNING, dlm_locking_stopped()).
+	 */
+
+	spin_lock(&ls->ls_recover_lock);
+	set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+	new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags);
+	ls->ls_recover_seq++;
+	spin_unlock(&ls->ls_recover_lock);
+
+	/*
+	 * This in_recovery lock does two things:
+	 *
+	 * 1) Keeps this function from returning until all threads are out
+	 *    of locking routines and locking is truely stopped.
+	 * 2) Keeps any new requests from being processed until it's unlocked
+	 *    when recovery is complete.
+	 */
+
+	if (new)
+		down_write(&ls->ls_in_recovery);
+
+	/*
+	 * The recoverd suspend/resume makes sure that dlm_recoverd (if
+	 * running) has noticed the clearing of RUNNING above and quit
+	 * processing the previous recovery.  This will be true for all nodes
+	 * before any nodes start the new recovery.
+	 */
+
+	dlm_recoverd_suspend(ls);
+	ls->ls_recover_status = 0;
+	dlm_recoverd_resume(ls);
+	return 0;
+}
+
+int dlm_ls_start(struct dlm_ls *ls)
+{
+	struct dlm_recover *rv = NULL, *rv_old;
+	int *ids = NULL;
+	int error, count;
+
+	rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
+	if (!rv)
+		return -ENOMEM;
+
+	error = count = dlm_nodeid_list(ls->ls_name, &ids);
+	if (error <= 0)
+		goto fail;
+
+	spin_lock(&ls->ls_recover_lock);
+
+	/* the lockspace needs to be stopped before it can be started */
+
+	if (!dlm_locking_stopped(ls)) {
+		spin_unlock(&ls->ls_recover_lock);
+		log_error(ls, "start ignored: lockspace running");
+		error = -EINVAL;
+		goto fail;
+	}
+
+	rv->nodeids = ids;
+	rv->node_count = count;
+	rv->seq = ++ls->ls_recover_seq;
+	rv_old = ls->ls_recover_args;
+	ls->ls_recover_args = rv;
+	spin_unlock(&ls->ls_recover_lock);
+
+	if (rv_old) {
+		kfree(rv_old->nodeids);
+		kfree(rv_old);
+	}
+
+	dlm_recoverd_kick(ls);
+	return 0;
+
+ fail:
+	kfree(rv);
+	kfree(ids);
+	return error;
+}
+
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
new file mode 100644
index 000000000000..927c08c19214
--- /dev/null
+++ b/fs/dlm/member.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMBER_DOT_H__
+#define __MEMBER_DOT_H__
+
+int dlm_ls_stop(struct dlm_ls *ls);
+int dlm_ls_start(struct dlm_ls *ls);
+void dlm_clear_members(struct dlm_ls *ls);
+void dlm_clear_members_gone(struct dlm_ls *ls);
+int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
+int dlm_is_removed(struct dlm_ls *ls, int nodeid);
+
+#endif                          /* __MEMBER_DOT_H__ */
+
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
new file mode 100644
index 000000000000..989b608fd836
--- /dev/null
+++ b/fs/dlm/memory.c
@@ -0,0 +1,116 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "config.h"
+#include "memory.h"
+
+static kmem_cache_t *lkb_cache;
+
+
+int dlm_memory_init(void)
+{
+	int ret = 0;
+
+	lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb),
+				__alignof__(struct dlm_lkb), 0, NULL, NULL);
+	if (!lkb_cache)
+		ret = -ENOMEM;
+	return ret;
+}
+
+void dlm_memory_exit(void)
+{
+	if (lkb_cache)
+		kmem_cache_destroy(lkb_cache);
+}
+
+char *allocate_lvb(struct dlm_ls *ls)
+{
+	char *p;
+
+	p = kmalloc(ls->ls_lvblen, GFP_KERNEL);
+	if (p)
+		memset(p, 0, ls->ls_lvblen);
+	return p;
+}
+
+void free_lvb(char *p)
+{
+	kfree(p);
+}
+
+/* FIXME: have some minimal space built-in to rsb for the name and
+   kmalloc a separate name if needed, like dentries are done */
+
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen)
+{
+	struct dlm_rsb *r;
+
+	DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
+
+	r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL);
+	if (r)
+		memset(r, 0, sizeof(*r) + namelen);
+	return r;
+}
+
+void free_rsb(struct dlm_rsb *r)
+{
+	if (r->res_lvbptr)
+		free_lvb(r->res_lvbptr);
+	kfree(r);
+}
+
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls)
+{
+	struct dlm_lkb *lkb;
+
+	lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL);
+	if (lkb)
+		memset(lkb, 0, sizeof(*lkb));
+	return lkb;
+}
+
+void free_lkb(struct dlm_lkb *lkb)
+{
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		struct dlm_user_args *ua;
+		ua = (struct dlm_user_args *)lkb->lkb_astparam;
+		if (ua) {
+			if (ua->lksb.sb_lvbptr)
+				kfree(ua->lksb.sb_lvbptr);
+			kfree(ua);
+		}
+	}
+	kmem_cache_free(lkb_cache, lkb);
+}
+
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen)
+{
+	struct dlm_direntry *de;
+
+	DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,
+		   printk("namelen = %d\n", namelen););
+
+	de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL);
+	if (de)
+		memset(de, 0, sizeof(*de) + namelen);
+	return de;
+}
+
+void free_direntry(struct dlm_direntry *de)
+{
+	kfree(de);
+}
+
diff --git a/fs/dlm/memory.h b/fs/dlm/memory.h
new file mode 100644
index 000000000000..6ead158ccc5c
--- /dev/null
+++ b/fs/dlm/memory.h
@@ -0,0 +1,29 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MEMORY_DOT_H__
+#define __MEMORY_DOT_H__
+
+int dlm_memory_init(void);
+void dlm_memory_exit(void);
+struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen);
+void free_rsb(struct dlm_rsb *r);
+struct dlm_lkb *allocate_lkb(struct dlm_ls *ls);
+void free_lkb(struct dlm_lkb *l);
+struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen);
+void free_direntry(struct dlm_direntry *de);
+char *allocate_lvb(struct dlm_ls *ls);
+void free_lvb(char *l);
+
+#endif		/* __MEMORY_DOT_H__ */
+
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
new file mode 100644
index 000000000000..c9b1c3d535f4
--- /dev/null
+++ b/fs/dlm/midcomms.c
@@ -0,0 +1,140 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+/*
+ * midcomms.c
+ *
+ * This is the appallingly named "mid-level" comms layer.
+ *
+ * Its purpose is to take packets from the "real" comms layer,
+ * split them up into packets and pass them to the interested
+ * part of the locking mechanism.
+ *
+ * It also takes messages from the locking layer, formats them
+ * into packets and sends them to the comms layer.
+ */
+
+#include "dlm_internal.h"
+#include "lowcomms.h"
+#include "config.h"
+#include "rcom.h"
+#include "lock.h"
+#include "midcomms.h"
+
+
+static void copy_from_cb(void *dst, const void *base, unsigned offset,
+			 unsigned len, unsigned limit)
+{
+	unsigned copy = len;
+
+	if ((copy + offset) > limit)
+		copy = limit - offset;
+	memcpy(dst, base + offset, copy);
+	len -= copy;
+	if (len)
+		memcpy(dst + copy, base, len);
+}
+
+/*
+ * Called from the low-level comms layer to process a buffer of
+ * commands.
+ *
+ * Only complete messages are processed here, any "spare" bytes from
+ * the end of a buffer are saved and tacked onto the front of the next
+ * message that comes in. I doubt this will happen very often but we
+ * need to be able to cope with it and I don't want the task to be waiting
+ * for packets to come in when there is useful work to be done.
+ */
+
+int dlm_process_incoming_buffer(int nodeid, const void *base,
+				unsigned offset, unsigned len, unsigned limit)
+{
+	unsigned char __tmp[DLM_INBUF_LEN];
+	struct dlm_header *msg = (struct dlm_header *) __tmp;
+	int ret = 0;
+	int err = 0;
+	uint16_t msglen;
+	uint32_t lockspace;
+
+	while (len > sizeof(struct dlm_header)) {
+
+		/* Copy just the header to check the total length.  The
+		   message may wrap around the end of the buffer back to the
+		   start, so we need to use a temp buffer and copy_from_cb. */
+
+		copy_from_cb(msg, base, offset, sizeof(struct dlm_header),
+			     limit);
+
+		msglen = le16_to_cpu(msg->h_length);
+		lockspace = msg->h_lockspace;
+
+		err = -EINVAL;
+		if (msglen < sizeof(struct dlm_header))
+			break;
+		err = -E2BIG;
+		if (msglen > dlm_config.buffer_size) {
+			log_print("message size %d from %d too big, buf len %d",
+				  msglen, nodeid, len);
+			break;
+		}
+		err = 0;
+
+		/* If only part of the full message is contained in this
+		   buffer, then do nothing and wait for lowcomms to call
+		   us again later with more data.  We return 0 meaning
+		   we've consumed none of the input buffer. */
+
+		if (msglen > len)
+			break;
+
+		/* Allocate a larger temp buffer if the full message won't fit
+		   in the buffer on the stack (which should work for most
+		   ordinary messages). */
+
+		if (msglen > sizeof(__tmp) &&
+		    msg == (struct dlm_header *) __tmp) {
+			msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+			if (msg == NULL)
+				return ret;
+		}
+
+		copy_from_cb(msg, base, offset, msglen, limit);
+
+		BUG_ON(lockspace != msg->h_lockspace);
+
+		ret += msglen;
+		offset += msglen;
+		offset &= (limit - 1);
+		len -= msglen;
+
+		switch (msg->h_cmd) {
+		case DLM_MSG:
+			dlm_receive_message(msg, nodeid, 0);
+			break;
+
+		case DLM_RCOM:
+			dlm_receive_rcom(msg, nodeid);
+			break;
+
+		default:
+			log_print("unknown msg type %x from %u: %u %u %u %u",
+				  msg->h_cmd, nodeid, msglen, len, offset, ret);
+		}
+	}
+
+	if (msg != (struct dlm_header *) __tmp)
+		kfree(msg);
+
+	return err ? err : ret;
+}
+
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
new file mode 100644
index 000000000000..95852a5f111d
--- /dev/null
+++ b/fs/dlm/midcomms.h
@@ -0,0 +1,21 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __MIDCOMMS_DOT_H__
+#define __MIDCOMMS_DOT_H__
+
+int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
+				unsigned len, unsigned limit);
+
+#endif				/* __MIDCOMMS_DOT_H__ */
+
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
new file mode 100644
index 000000000000..518239a8b1e9
--- /dev/null
+++ b/fs/dlm/rcom.c
@@ -0,0 +1,472 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "lowcomms.h"
+#include "midcomms.h"
+#include "rcom.h"
+#include "recover.h"
+#include "dir.h"
+#include "config.h"
+#include "memory.h"
+#include "lock.h"
+#include "util.h"
+
+
+static int rcom_response(struct dlm_ls *ls)
+{
+	return test_bit(LSFL_RCOM_READY, &ls->ls_flags);
+}
+
+static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
+		       struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	char *mb;
+	int mb_len = sizeof(struct dlm_rcom) + len;
+
+	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
+	if (!mh) {
+		log_print("create_rcom to %d type %d len %d ENOBUFS",
+			  to_nodeid, type, len);
+		return -ENOBUFS;
+	}
+	memset(mb, 0, mb_len);
+
+	rc = (struct dlm_rcom *) mb;
+
+	rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	rc->rc_header.h_lockspace = ls->ls_global_id;
+	rc->rc_header.h_nodeid = dlm_our_nodeid();
+	rc->rc_header.h_length = mb_len;
+	rc->rc_header.h_cmd = DLM_RCOM;
+
+	rc->rc_type = type;
+
+	*mh_ret = mh;
+	*rc_ret = rc;
+	return 0;
+}
+
+static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
+		      struct dlm_rcom *rc)
+{
+	dlm_rcom_out(rc);
+	dlm_lowcomms_commit_buffer(mh);
+}
+
+/* When replying to a status request, a node also sends back its
+   configuration values.  The requesting node then checks that the remote
+   node is configured the same way as itself. */
+
+static void make_config(struct dlm_ls *ls, struct rcom_config *rf)
+{
+	rf->rf_lvblen = ls->ls_lvblen;
+	rf->rf_lsflags = ls->ls_exflags;
+}
+
+static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
+{
+	if (rf->rf_lvblen != ls->ls_lvblen ||
+	    rf->rf_lsflags != ls->ls_exflags) {
+		log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
+			  ls->ls_lvblen, ls->ls_exflags,
+			  nodeid, rf->rf_lvblen, rf->rf_lsflags);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error = 0;
+
+	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+	ls->ls_recover_nodeid = nodeid;
+
+	if (nodeid == dlm_our_nodeid()) {
+		rc = (struct dlm_rcom *) ls->ls_recover_buf;
+		rc->rc_result = dlm_recover_status(ls);
+		goto out;
+	}
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh);
+	if (error)
+		goto out;
+	rc->rc_id = ++ls->ls_rcom_seq;
+
+	send_rcom(ls, mh, rc);
+
+	error = dlm_wait_function(ls, &rcom_response);
+	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	if (error)
+		goto out;
+
+	rc = (struct dlm_rcom *) ls->ls_recover_buf;
+
+	if (rc->rc_result == -ESRCH) {
+		/* we pretend the remote lockspace exists with 0 status */
+		log_debug(ls, "remote node %d not ready", nodeid);
+		rc->rc_result = 0;
+	} else
+		error = check_config(ls, (struct rcom_config *) rc->rc_buf,
+				     nodeid);
+	/* the caller looks at rc_result for the remote recovery status */
+ out:
+	return error;
+}
+
+static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error, nodeid = rc_in->rc_header.h_nodeid;
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
+			    sizeof(struct rcom_config), &rc, &mh);
+	if (error)
+		return;
+	rc->rc_id = rc_in->rc_id;
+	rc->rc_result = dlm_recover_status(ls);
+	make_config(ls, (struct rcom_config *) rc->rc_buf);
+
+	send_rcom(ls, mh, rc);
+}
+
+static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	if (rc_in->rc_id != ls->ls_rcom_seq) {
+		log_debug(ls, "reject old reply %d got %llx wanted %llx",
+			  rc_in->rc_type, rc_in->rc_id, ls->ls_rcom_seq);
+		return;
+	}
+	memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
+	set_bit(LSFL_RCOM_READY, &ls->ls_flags);
+	wake_up(&ls->ls_wait_general);
+}
+
+static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	receive_sync_reply(ls, rc_in);
+}
+
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error = 0, len = sizeof(struct dlm_rcom);
+
+	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+	ls->ls_recover_nodeid = nodeid;
+
+	if (nodeid == dlm_our_nodeid()) {
+		dlm_copy_master_names(ls, last_name, last_len,
+		                      ls->ls_recover_buf + len,
+		                      dlm_config.buffer_size - len, nodeid);
+		goto out;
+	}
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh);
+	if (error)
+		goto out;
+	memcpy(rc->rc_buf, last_name, last_len);
+	rc->rc_id = ++ls->ls_rcom_seq;
+
+	send_rcom(ls, mh, rc);
+
+	error = dlm_wait_function(ls, &rcom_response);
+	clear_bit(LSFL_RCOM_READY, &ls->ls_flags);
+ out:
+	return error;
+}
+
+static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error, inlen, outlen;
+	int nodeid = rc_in->rc_header.h_nodeid;
+	uint32_t status = dlm_recover_status(ls);
+
+	/*
+	 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
+	 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
+	 * It could only happen in rare cases where we get a late NAMES
+	 * message from a previous instance of recovery.
+	 */
+
+	if (!(status & DLM_RS_NODES)) {
+		log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
+		return;
+	}
+
+	nodeid = rc_in->rc_header.h_nodeid;
+	inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+	outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
+	if (error)
+		return;
+	rc->rc_id = rc_in->rc_id;
+
+	dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
+			      nodeid);
+	send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	receive_sync_reply(ls, rc_in);
+}
+
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	struct dlm_ls *ls = r->res_ls;
+	int error;
+
+	error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
+			    &rc, &mh);
+	if (error)
+		goto out;
+	memcpy(rc->rc_buf, r->res_name, r->res_length);
+	rc->rc_id = (unsigned long) r;
+
+	send_rcom(ls, mh, rc);
+ out:
+	return error;
+}
+
+static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
+	int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
+	if (error)
+		return;
+
+	error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
+	if (error)
+		ret_nodeid = error;
+	rc->rc_result = ret_nodeid;
+	rc->rc_id = rc_in->rc_id;
+
+	send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	dlm_recover_master_reply(ls, rc_in);
+}
+
+static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
+			   struct rcom_lock *rl)
+{
+	memset(rl, 0, sizeof(*rl));
+
+	rl->rl_ownpid = lkb->lkb_ownpid;
+	rl->rl_lkid = lkb->lkb_id;
+	rl->rl_exflags = lkb->lkb_exflags;
+	rl->rl_flags = lkb->lkb_flags;
+	rl->rl_lvbseq = lkb->lkb_lvbseq;
+	rl->rl_rqmode = lkb->lkb_rqmode;
+	rl->rl_grmode = lkb->lkb_grmode;
+	rl->rl_status = lkb->lkb_status;
+	rl->rl_wait_type = lkb->lkb_wait_type;
+
+	if (lkb->lkb_bastaddr)
+		rl->rl_asts |= AST_BAST;
+	if (lkb->lkb_astaddr)
+		rl->rl_asts |= AST_COMP;
+
+	rl->rl_namelen = r->res_length;
+	memcpy(rl->rl_name, r->res_name, r->res_length);
+
+	/* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
+	   If so, receive_rcom_lock_args() won't take this copy. */
+
+	if (lkb->lkb_lvbptr)
+		memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
+}
+
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+{
+	struct dlm_ls *ls = r->res_ls;
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	struct rcom_lock *rl;
+	int error, len = sizeof(struct rcom_lock);
+
+	if (lkb->lkb_lvbptr)
+		len += ls->ls_lvblen;
+
+	error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
+	if (error)
+		goto out;
+
+	rl = (struct rcom_lock *) rc->rc_buf;
+	pack_rcom_lock(r, lkb, rl);
+	rc->rc_id = (unsigned long) r;
+
+	send_rcom(ls, mh, rc);
+ out:
+	return error;
+}
+
+static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	int error, nodeid = rc_in->rc_header.h_nodeid;
+
+	dlm_recover_master_copy(ls, rc_in);
+
+	error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
+			    sizeof(struct rcom_lock), &rc, &mh);
+	if (error)
+		return;
+
+	/* We send back the same rcom_lock struct we received, but
+	   dlm_recover_master_copy() has filled in rl_remid and rl_result */
+
+	memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
+	rc->rc_id = rc_in->rc_id;
+
+	send_rcom(ls, mh, rc);
+}
+
+static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+{
+	uint32_t status = dlm_recover_status(ls);
+
+	if (!(status & DLM_RS_DIR)) {
+		log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
+			  rc_in->rc_header.h_nodeid);
+		return;
+	}
+
+	dlm_recover_process_copy(ls, rc_in);
+}
+
+static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
+{
+	struct dlm_rcom *rc;
+	struct dlm_mhandle *mh;
+	char *mb;
+	int mb_len = sizeof(struct dlm_rcom);
+
+	mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
+	if (!mh)
+		return -ENOBUFS;
+	memset(mb, 0, mb_len);
+
+	rc = (struct dlm_rcom *) mb;
+
+	rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
+	rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
+	rc->rc_header.h_nodeid = dlm_our_nodeid();
+	rc->rc_header.h_length = mb_len;
+	rc->rc_header.h_cmd = DLM_RCOM;
+
+	rc->rc_type = DLM_RCOM_STATUS_REPLY;
+	rc->rc_id = rc_in->rc_id;
+	rc->rc_result = -ESRCH;
+
+	dlm_rcom_out(rc);
+	dlm_lowcomms_commit_buffer(mh);
+
+	return 0;
+}
+
+/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
+   recovery-only comms are sent through here. */
+
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
+{
+	struct dlm_rcom *rc = (struct dlm_rcom *) hd;
+	struct dlm_ls *ls;
+
+	dlm_rcom_in(rc);
+
+	/* If the lockspace doesn't exist then still send a status message
+	   back; it's possible that it just doesn't have its global_id yet. */
+
+	ls = dlm_find_lockspace_global(hd->h_lockspace);
+	if (!ls) {
+		log_print("lockspace %x from %d not found",
+			  hd->h_lockspace, nodeid);
+		send_ls_not_ready(nodeid, rc);
+		return;
+	}
+
+	if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
+		log_error(ls, "ignoring recovery message %x from %d",
+			  rc->rc_type, nodeid);
+		goto out;
+	}
+
+	if (nodeid != rc->rc_header.h_nodeid) {
+		log_error(ls, "bad rcom nodeid %d from %d",
+			  rc->rc_header.h_nodeid, nodeid);
+		goto out;
+	}
+
+	switch (rc->rc_type) {
+	case DLM_RCOM_STATUS:
+		receive_rcom_status(ls, rc);
+		break;
+
+	case DLM_RCOM_NAMES:
+		receive_rcom_names(ls, rc);
+		break;
+
+	case DLM_RCOM_LOOKUP:
+		receive_rcom_lookup(ls, rc);
+		break;
+
+	case DLM_RCOM_LOCK:
+		receive_rcom_lock(ls, rc);
+		break;
+
+	case DLM_RCOM_STATUS_REPLY:
+		receive_rcom_status_reply(ls, rc);
+		break;
+
+	case DLM_RCOM_NAMES_REPLY:
+		receive_rcom_names_reply(ls, rc);
+		break;
+
+	case DLM_RCOM_LOOKUP_REPLY:
+		receive_rcom_lookup_reply(ls, rc);
+		break;
+
+	case DLM_RCOM_LOCK_REPLY:
+		receive_rcom_lock_reply(ls, rc);
+		break;
+
+	default:
+		DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
+	}
+ out:
+	dlm_put_lockspace(ls);
+}
+
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
new file mode 100644
index 000000000000..d7984321ff41
--- /dev/null
+++ b/fs/dlm/rcom.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RCOM_DOT_H__
+#define __RCOM_DOT_H__
+
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
+void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
+
+#endif
+
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
new file mode 100644
index 000000000000..a5e6d184872e
--- /dev/null
+++ b/fs/dlm/recover.c
@@ -0,0 +1,765 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "dir.h"
+#include "config.h"
+#include "ast.h"
+#include "memory.h"
+#include "rcom.h"
+#include "lock.h"
+#include "lowcomms.h"
+#include "member.h"
+#include "recover.h"
+
+
+/*
+ * Recovery waiting routines: these functions wait for a particular reply from
+ * a remote node, or for the remote node to report a certain status.  They need
+ * to abort if the lockspace is stopped indicating a node has failed (perhaps
+ * the one being waited for).
+ */
+
+/*
+ * Wait until given function returns non-zero or lockspace is stopped
+ * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes).  When another
+ * function thinks it could have completed the waited-on task, they should wake
+ * up ls_wait_general to get an immediate response rather than waiting for the
+ * timer to detect the result.  A timer wakes us up periodically while waiting
+ * to see if we should abort due to a node failure.  This should only be called
+ * by the dlm_recoverd thread.
+ */
+
+static void dlm_wait_timer_fn(unsigned long data)
+{
+	struct dlm_ls *ls = (struct dlm_ls *) data;
+	mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
+	wake_up(&ls->ls_wait_general);
+}
+
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
+{
+	int error = 0;
+
+	init_timer(&ls->ls_timer);
+	ls->ls_timer.function = dlm_wait_timer_fn;
+	ls->ls_timer.data = (long) ls;
+	ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
+	add_timer(&ls->ls_timer);
+
+	wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
+	del_timer_sync(&ls->ls_timer);
+
+	if (dlm_recovery_stopped(ls)) {
+		log_debug(ls, "dlm_wait_function aborted");
+		error = -EINTR;
+	}
+	return error;
+}
+
+/*
+ * An efficient way for all nodes to wait for all others to have a certain
+ * status.  The node with the lowest nodeid polls all the others for their
+ * status (wait_status_all) and all the others poll the node with the low id
+ * for its accumulated result (wait_status_low).  When all nodes have set
+ * status flag X, then status flag X_ALL will be set on the low nodeid.
+ */
+
+uint32_t dlm_recover_status(struct dlm_ls *ls)
+{
+	uint32_t status;
+	spin_lock(&ls->ls_recover_lock);
+	status = ls->ls_recover_status;
+	spin_unlock(&ls->ls_recover_lock);
+	return status;
+}
+
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
+{
+	spin_lock(&ls->ls_recover_lock);
+	ls->ls_recover_status |= status;
+	spin_unlock(&ls->ls_recover_lock);
+}
+
+static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
+{
+	struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
+	struct dlm_member *memb;
+	int error = 0, delay;
+
+	list_for_each_entry(memb, &ls->ls_nodes, list) {
+		delay = 0;
+		for (;;) {
+			if (dlm_recovery_stopped(ls)) {
+				error = -EINTR;
+				goto out;
+			}
+
+			error = dlm_rcom_status(ls, memb->nodeid);
+			if (error)
+				goto out;
+
+			if (rc->rc_result & wait_status)
+				break;
+			if (delay < 1000)
+				delay += 20;
+			msleep(delay);
+		}
+	}
+ out:
+	return error;
+}
+
+static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
+{
+	struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
+	int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
+
+	for (;;) {
+		if (dlm_recovery_stopped(ls)) {
+			error = -EINTR;
+			goto out;
+		}
+
+		error = dlm_rcom_status(ls, nodeid);
+		if (error)
+			break;
+
+		if (rc->rc_result & wait_status)
+			break;
+		if (delay < 1000)
+			delay += 20;
+		msleep(delay);
+	}
+ out:
+	return error;
+}
+
+static int wait_status(struct dlm_ls *ls, uint32_t status)
+{
+	uint32_t status_all = status << 1;
+	int error;
+
+	if (ls->ls_low_nodeid == dlm_our_nodeid()) {
+		error = wait_status_all(ls, status);
+		if (!error)
+			dlm_set_recover_status(ls, status_all);
+	} else
+		error = wait_status_low(ls, status_all);
+
+	return error;
+}
+
+int dlm_recover_members_wait(struct dlm_ls *ls)
+{
+	return wait_status(ls, DLM_RS_NODES);
+}
+
+int dlm_recover_directory_wait(struct dlm_ls *ls)
+{
+	return wait_status(ls, DLM_RS_DIR);
+}
+
+int dlm_recover_locks_wait(struct dlm_ls *ls)
+{
+	return wait_status(ls, DLM_RS_LOCKS);
+}
+
+int dlm_recover_done_wait(struct dlm_ls *ls)
+{
+	return wait_status(ls, DLM_RS_DONE);
+}
+
+/*
+ * The recover_list contains all the rsb's for which we've requested the new
+ * master nodeid.  As replies are returned from the resource directories the
+ * rsb's are removed from the list.  When the list is empty we're done.
+ *
+ * The recover_list is later similarly used for all rsb's for which we've sent
+ * new lkb's and need to receive new corresponding lkid's.
+ *
+ * We use the address of the rsb struct as a simple local identifier for the
+ * rsb so we can match an rcom reply with the rsb it was sent for.
+ */
+
+static int recover_list_empty(struct dlm_ls *ls)
+{
+	int empty;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	empty = list_empty(&ls->ls_recover_list);
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	return empty;
+}
+
+static void recover_list_add(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	if (list_empty(&r->res_recover_list)) {
+		list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
+		ls->ls_recover_list_count++;
+		dlm_hold_rsb(r);
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+static void recover_list_del(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_del_init(&r->res_recover_list);
+	ls->ls_recover_list_count--;
+	spin_unlock(&ls->ls_recover_list_lock);
+
+	dlm_put_rsb(r);
+}
+
+static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
+{
+	struct dlm_rsb *r = NULL;
+
+	spin_lock(&ls->ls_recover_list_lock);
+
+	list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
+		if (id == (unsigned long) r)
+			goto out;
+	}
+	r = NULL;
+ out:
+	spin_unlock(&ls->ls_recover_list_lock);
+	return r;
+}
+
+static void recover_list_clear(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r, *s;
+
+	spin_lock(&ls->ls_recover_list_lock);
+	list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
+		list_del_init(&r->res_recover_list);
+		dlm_put_rsb(r);
+		ls->ls_recover_list_count--;
+	}
+
+	if (ls->ls_recover_list_count != 0) {
+		log_error(ls, "warning: recover_list_count %d",
+			  ls->ls_recover_list_count);
+		ls->ls_recover_list_count = 0;
+	}
+	spin_unlock(&ls->ls_recover_list_lock);
+}
+
+
+/* Master recovery: find new master node for rsb's that were
+   mastered on nodes that have been removed.
+
+   dlm_recover_masters
+   recover_master
+   dlm_send_rcom_lookup            ->  receive_rcom_lookup
+                                       dlm_dir_lookup
+   receive_rcom_lookup_reply       <-
+   dlm_recover_master_reply
+   set_new_master
+   set_master_lkbs
+   set_lock_master
+*/
+
+/*
+ * Set the lock master for all LKBs in a lock queue
+ * If we are the new master of the rsb, we may have received new
+ * MSTCPY locks from other nodes already which we need to ignore
+ * when setting the new nodeid.
+ */
+
+static void set_lock_master(struct list_head *queue, int nodeid)
+{
+	struct dlm_lkb *lkb;
+
+	list_for_each_entry(lkb, queue, lkb_statequeue)
+		if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
+			lkb->lkb_nodeid = nodeid;
+}
+
+static void set_master_lkbs(struct dlm_rsb *r)
+{
+	set_lock_master(&r->res_grantqueue, r->res_nodeid);
+	set_lock_master(&r->res_convertqueue, r->res_nodeid);
+	set_lock_master(&r->res_waitqueue, r->res_nodeid);
+}
+
+/*
+ * Propogate the new master nodeid to locks
+ * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
+ * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
+ * rsb's to consider.
+ */
+
+static void set_new_master(struct dlm_rsb *r, int nodeid)
+{
+	lock_rsb(r);
+	r->res_nodeid = nodeid;
+	set_master_lkbs(r);
+	rsb_set_flag(r, RSB_NEW_MASTER);
+	rsb_set_flag(r, RSB_NEW_MASTER2);
+	unlock_rsb(r);
+}
+
+/*
+ * We do async lookups on rsb's that need new masters.  The rsb's
+ * waiting for a lookup reply are kept on the recover_list.
+ */
+
+static int recover_master(struct dlm_rsb *r)
+{
+	struct dlm_ls *ls = r->res_ls;
+	int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();
+
+	dir_nodeid = dlm_dir_nodeid(r);
+
+	if (dir_nodeid == our_nodeid) {
+		error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
+				       r->res_length, &ret_nodeid);
+		if (error)
+			log_error(ls, "recover dir lookup error %d", error);
+
+		if (ret_nodeid == our_nodeid)
+			ret_nodeid = 0;
+		set_new_master(r, ret_nodeid);
+	} else {
+		recover_list_add(r);
+		error = dlm_send_rcom_lookup(r, dir_nodeid);
+	}
+
+	return error;
+}
+
+/*
+ * When not using a directory, most resource names will hash to a new static
+ * master nodeid and the resource will need to be remastered.
+ */
+
+static int recover_master_static(struct dlm_rsb *r)
+{
+	int master = dlm_dir_nodeid(r);
+
+	if (master == dlm_our_nodeid())
+		master = 0;
+
+	if (r->res_nodeid != master) {
+		if (is_master(r))
+			dlm_purge_mstcpy_locks(r);
+		set_new_master(r, master);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Go through local root resources and for each rsb which has a master which
+ * has departed, get the new master nodeid from the directory.  The dir will
+ * assign mastery to the first node to look up the new master.  That means
+ * we'll discover in this lookup if we're the new master of any rsb's.
+ *
+ * We fire off all the dir lookup requests individually and asynchronously to
+ * the correct dir node.
+ */
+
+int dlm_recover_masters(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	int error = 0, count = 0;
+
+	log_debug(ls, "dlm_recover_masters");
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (dlm_recovery_stopped(ls)) {
+			up_read(&ls->ls_root_sem);
+			error = -EINTR;
+			goto out;
+		}
+
+		if (dlm_no_directory(ls))
+			count += recover_master_static(r);
+		else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
+			recover_master(r);
+			count++;
+		}
+
+		schedule();
+	}
+	up_read(&ls->ls_root_sem);
+
+	log_debug(ls, "dlm_recover_masters %d resources", count);
+
+	error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+	if (error)
+		recover_list_clear(ls);
+	return error;
+}
+
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	struct dlm_rsb *r;
+	int nodeid;
+
+	r = recover_list_find(ls, rc->rc_id);
+	if (!r) {
+		log_error(ls, "dlm_recover_master_reply no id %llx",
+			  (unsigned long long)rc->rc_id);
+		goto out;
+	}
+
+	nodeid = rc->rc_result;
+	if (nodeid == dlm_our_nodeid())
+		nodeid = 0;
+
+	set_new_master(r, nodeid);
+	recover_list_del(r);
+
+	if (recover_list_empty(ls))
+		wake_up(&ls->ls_wait_general);
+ out:
+	return 0;
+}
+
+
+/* Lock recovery: rebuild the process-copy locks we hold on a
+   remastered rsb on the new rsb master.
+
+   dlm_recover_locks
+   recover_locks
+   recover_locks_queue
+   dlm_send_rcom_lock              ->  receive_rcom_lock
+                                       dlm_recover_master_copy
+   receive_rcom_lock_reply         <-
+   dlm_recover_process_copy
+*/
+
+
+/*
+ * keep a count of the number of lkb's we send to the new master; when we get
+ * an equal number of replies then recovery for the rsb is done
+ */
+
+static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
+{
+	struct dlm_lkb *lkb;
+	int error = 0;
+
+	list_for_each_entry(lkb, head, lkb_statequeue) {
+	   	error = dlm_send_rcom_lock(r, lkb);
+		if (error)
+			break;
+		r->res_recover_locks_count++;
+	}
+
+	return error;
+}
+
+static int recover_locks(struct dlm_rsb *r)
+{
+	int error = 0;
+
+	lock_rsb(r);
+
+	DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
+
+	error = recover_locks_queue(r, &r->res_grantqueue);
+	if (error)
+		goto out;
+	error = recover_locks_queue(r, &r->res_convertqueue);
+	if (error)
+		goto out;
+	error = recover_locks_queue(r, &r->res_waitqueue);
+	if (error)
+		goto out;
+
+	if (r->res_recover_locks_count)
+		recover_list_add(r);
+	else
+		rsb_clear_flag(r, RSB_NEW_MASTER);
+ out:
+	unlock_rsb(r);
+	return error;
+}
+
+int dlm_recover_locks(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	int error, count = 0;
+
+	log_debug(ls, "dlm_recover_locks");
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		if (is_master(r)) {
+			rsb_clear_flag(r, RSB_NEW_MASTER);
+			continue;
+		}
+
+		if (!rsb_flag(r, RSB_NEW_MASTER))
+			continue;
+
+		if (dlm_recovery_stopped(ls)) {
+			error = -EINTR;
+			up_read(&ls->ls_root_sem);
+			goto out;
+		}
+
+		error = recover_locks(r);
+		if (error) {
+			up_read(&ls->ls_root_sem);
+			goto out;
+		}
+
+		count += r->res_recover_locks_count;
+	}
+	up_read(&ls->ls_root_sem);
+
+	log_debug(ls, "dlm_recover_locks %d locks", count);
+
+	error = dlm_wait_function(ls, &recover_list_empty);
+ out:
+	if (error)
+		recover_list_clear(ls);
+	else
+		dlm_set_recover_status(ls, DLM_RS_LOCKS);
+	return error;
+}
+
+void dlm_recovered_lock(struct dlm_rsb *r)
+{
+	DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_dump_rsb(r););
+
+	r->res_recover_locks_count--;
+	if (!r->res_recover_locks_count) {
+		rsb_clear_flag(r, RSB_NEW_MASTER);
+		recover_list_del(r);
+	}
+
+	if (recover_list_empty(r->res_ls))
+		wake_up(&r->res_ls->ls_wait_general);
+}
+
+/*
+ * The lvb needs to be recovered on all master rsb's.  This includes setting
+ * the VALNOTVALID flag if necessary, and determining the correct lvb contents
+ * based on the lvb's of the locks held on the rsb.
+ *
+ * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb.  If it
+ * was already set prior to recovery, it's not cleared, regardless of locks.
+ *
+ * The LVB contents are only considered for changing when this is a new master
+ * of the rsb (NEW_MASTER2).  Then, the rsb's lvb is taken from any lkb with
+ * mode > CR.  If no lkb's exist with mode above CR, the lvb contents are taken
+ * from the lkb with the largest lvb sequence number.
+ */
+
+static void recover_lvb(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb, *high_lkb = NULL;
+	uint32_t high_seq = 0;
+	int lock_lvb_exists = 0;
+	int big_lock_exists = 0;
+	int lvblen = r->res_ls->ls_lvblen;
+
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			continue;
+
+		lock_lvb_exists = 1;
+
+		if (lkb->lkb_grmode > DLM_LOCK_CR) {
+			big_lock_exists = 1;
+			goto setflag;
+		}
+
+		if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
+			high_lkb = lkb;
+			high_seq = lkb->lkb_lvbseq;
+		}
+	}
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+			continue;
+
+		lock_lvb_exists = 1;
+
+		if (lkb->lkb_grmode > DLM_LOCK_CR) {
+			big_lock_exists = 1;
+			goto setflag;
+		}
+
+		if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
+			high_lkb = lkb;
+			high_seq = lkb->lkb_lvbseq;
+		}
+	}
+
+ setflag:
+	if (!lock_lvb_exists)
+		goto out;
+
+	if (!big_lock_exists)
+		rsb_set_flag(r, RSB_VALNOTVALID);
+
+	/* don't mess with the lvb unless we're the new master */
+	if (!rsb_flag(r, RSB_NEW_MASTER2))
+		goto out;
+
+	if (!r->res_lvbptr) {
+		r->res_lvbptr = allocate_lvb(r->res_ls);
+		if (!r->res_lvbptr)
+			goto out;
+	}
+
+	if (big_lock_exists) {
+		r->res_lvbseq = lkb->lkb_lvbseq;
+		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
+	} else if (high_lkb) {
+		r->res_lvbseq = high_lkb->lkb_lvbseq;
+		memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
+	} else {
+		r->res_lvbseq = 0;
+		memset(r->res_lvbptr, 0, lvblen);
+	}
+ out:
+	return;
+}
+
+/* All master rsb's flagged RECOVER_CONVERT need to be looked at.  The locks
+   converting PR->CW or CW->PR need to have their lkb_grmode set. */
+
+static void recover_conversion(struct dlm_rsb *r)
+{
+	struct dlm_lkb *lkb;
+	int grmode = -1;
+
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		if (lkb->lkb_grmode == DLM_LOCK_PR ||
+		    lkb->lkb_grmode == DLM_LOCK_CW) {
+			grmode = lkb->lkb_grmode;
+			break;
+		}
+	}
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		if (lkb->lkb_grmode != DLM_LOCK_IV)
+			continue;
+		if (grmode == -1)
+			lkb->lkb_grmode = lkb->lkb_rqmode;
+		else
+			lkb->lkb_grmode = grmode;
+	}
+}
+
+/* We've become the new master for this rsb and waiting/converting locks may
+   need to be granted in dlm_grant_after_purge() due to locks that may have
+   existed from a removed node. */
+
+static void set_locks_purged(struct dlm_rsb *r)
+{
+	if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
+		rsb_set_flag(r, RSB_LOCKS_PURGED);
+}
+
+void dlm_recover_rsbs(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	int count = 0;
+
+	log_debug(ls, "dlm_recover_rsbs");
+
+	down_read(&ls->ls_root_sem);
+	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
+		lock_rsb(r);
+		if (is_master(r)) {
+			if (rsb_flag(r, RSB_RECOVER_CONVERT))
+				recover_conversion(r);
+			if (rsb_flag(r, RSB_NEW_MASTER2))
+				set_locks_purged(r);
+			recover_lvb(r);
+			count++;
+		}
+		rsb_clear_flag(r, RSB_RECOVER_CONVERT);
+		rsb_clear_flag(r, RSB_NEW_MASTER2);
+		unlock_rsb(r);
+	}
+	up_read(&ls->ls_root_sem);
+
+	log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
+}
+
+/* Create a single list of all root rsb's to be used during recovery */
+
+int dlm_create_root_list(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r;
+	int i, error = 0;
+
+	down_write(&ls->ls_root_sem);
+	if (!list_empty(&ls->ls_root_list)) {
+		log_error(ls, "root list not empty");
+		error = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		read_lock(&ls->ls_rsbtbl[i].lock);
+		list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
+			list_add(&r->res_root_list, &ls->ls_root_list);
+			dlm_hold_rsb(r);
+		}
+		read_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+ out:
+	up_write(&ls->ls_root_sem);
+	return error;
+}
+
+void dlm_release_root_list(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r, *safe;
+
+	down_write(&ls->ls_root_sem);
+	list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
+		list_del_init(&r->res_root_list);
+		dlm_put_rsb(r);
+	}
+	up_write(&ls->ls_root_sem);
+}
+
+void dlm_clear_toss_list(struct dlm_ls *ls)
+{
+	struct dlm_rsb *r, *safe;
+	int i;
+
+	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
+		write_lock(&ls->ls_rsbtbl[i].lock);
+		list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
+					 res_hashchain) {
+			list_del(&r->res_hashchain);
+			free_rsb(r);
+		}
+		write_unlock(&ls->ls_rsbtbl[i].lock);
+	}
+}
+
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
new file mode 100644
index 000000000000..ebd0363f1e08
--- /dev/null
+++ b/fs/dlm/recover.h
@@ -0,0 +1,34 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVER_DOT_H__
+#define __RECOVER_DOT_H__
+
+int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
+uint32_t dlm_recover_status(struct dlm_ls *ls);
+void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
+int dlm_recover_members_wait(struct dlm_ls *ls);
+int dlm_recover_directory_wait(struct dlm_ls *ls);
+int dlm_recover_locks_wait(struct dlm_ls *ls);
+int dlm_recover_done_wait(struct dlm_ls *ls);
+int dlm_recover_masters(struct dlm_ls *ls);
+int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_locks(struct dlm_ls *ls);
+void dlm_recovered_lock(struct dlm_rsb *r);
+int dlm_create_root_list(struct dlm_ls *ls);
+void dlm_release_root_list(struct dlm_ls *ls);
+void dlm_clear_toss_list(struct dlm_ls *ls);
+void dlm_recover_rsbs(struct dlm_ls *ls);
+
+#endif				/* __RECOVER_DOT_H__ */
+
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
new file mode 100644
index 000000000000..362e3eff4dc9
--- /dev/null
+++ b/fs/dlm/recoverd.c
@@ -0,0 +1,290 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "member.h"
+#include "dir.h"
+#include "ast.h"
+#include "recover.h"
+#include "lowcomms.h"
+#include "lock.h"
+#include "requestqueue.h"
+#include "recoverd.h"
+
+
+/* If the start for which we're re-enabling locking (seq) has been superseded
+   by a newer stop (ls_recover_seq), we need to leave locking disabled. */
+
+static int enable_locking(struct dlm_ls *ls, uint64_t seq)
+{
+	int error = -EINTR;
+
+	spin_lock(&ls->ls_recover_lock);
+	if (ls->ls_recover_seq == seq) {
+		set_bit(LSFL_RUNNING, &ls->ls_flags);
+		up_write(&ls->ls_in_recovery);
+		error = 0;
+	}
+	spin_unlock(&ls->ls_recover_lock);
+	return error;
+}
+
+static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
+{
+	unsigned long start;
+	int error, neg = 0;
+
+	log_debug(ls, "recover %llx", rv->seq);
+
+	mutex_lock(&ls->ls_recoverd_active);
+
+	/*
+	 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
+	 * will be processed by dlm_astd during recovery.
+	 */
+
+	dlm_astd_suspend();
+	dlm_astd_resume();
+
+	/*
+	 * This list of root rsb's will be the basis of most of the recovery
+	 * routines.
+	 */
+
+	dlm_create_root_list(ls);
+
+	/*
+	 * Free all the tossed rsb's so we don't have to recover them.
+	 */
+
+	dlm_clear_toss_list(ls);
+
+	/*
+	 * Add or remove nodes from the lockspace's ls_nodes list.
+	 * Also waits for all nodes to complete dlm_recover_members.
+	 */
+
+	error = dlm_recover_members(ls, rv, &neg);
+	if (error) {
+		log_error(ls, "recover_members failed %d", error);
+		goto fail;
+	}
+	start = jiffies;
+
+	/*
+	 * Rebuild our own share of the directory by collecting from all other
+	 * nodes their master rsb names that hash to us.
+	 */
+
+	error = dlm_recover_directory(ls);
+	if (error) {
+		log_error(ls, "recover_directory failed %d", error);
+		goto fail;
+	}
+
+	/*
+	 * Purge directory-related requests that are saved in requestqueue.
+	 * All dir requests from before recovery are invalid now due to the dir
+	 * rebuild and will be resent by the requesting nodes.
+	 */
+
+	dlm_purge_requestqueue(ls);
+
+	/*
+	 * Wait for all nodes to complete directory rebuild.
+	 */
+
+	error = dlm_recover_directory_wait(ls);
+	if (error) {
+		log_error(ls, "recover_directory_wait failed %d", error);
+		goto fail;
+	}
+
+	/*
+	 * We may have outstanding operations that are waiting for a reply from
+	 * a failed node.  Mark these to be resent after recovery.  Unlock and
+	 * cancel ops can just be completed.
+	 */
+
+	dlm_recover_waiters_pre(ls);
+
+	error = dlm_recovery_stopped(ls);
+	if (error)
+		goto fail;
+
+	if (neg || dlm_no_directory(ls)) {
+		/*
+		 * Clear lkb's for departed nodes.
+		 */
+
+		dlm_purge_locks(ls);
+
+		/*
+		 * Get new master nodeid's for rsb's that were mastered on
+		 * departed nodes.
+		 */
+
+		error = dlm_recover_masters(ls);
+		if (error) {
+			log_error(ls, "recover_masters failed %d", error);
+			goto fail;
+		}
+
+		/*
+		 * Send our locks on remastered rsb's to the new masters.
+		 */
+
+		error = dlm_recover_locks(ls);
+		if (error) {
+			log_error(ls, "recover_locks failed %d", error);
+			goto fail;
+		}
+
+		error = dlm_recover_locks_wait(ls);
+		if (error) {
+			log_error(ls, "recover_locks_wait failed %d", error);
+			goto fail;
+		}
+
+		/*
+		 * Finalize state in master rsb's now that all locks can be
+		 * checked.  This includes conversion resolution and lvb
+		 * settings.
+		 */
+
+		dlm_recover_rsbs(ls);
+	}
+
+	dlm_release_root_list(ls);
+
+	dlm_set_recover_status(ls, DLM_RS_DONE);
+	error = dlm_recover_done_wait(ls);
+	if (error) {
+		log_error(ls, "recover_done_wait failed %d", error);
+		goto fail;
+	}
+
+	dlm_clear_members_gone(ls);
+
+	error = enable_locking(ls, rv->seq);
+	if (error) {
+		log_error(ls, "enable_locking failed %d", error);
+		goto fail;
+	}
+
+	error = dlm_process_requestqueue(ls);
+	if (error) {
+		log_error(ls, "process_requestqueue failed %d", error);
+		goto fail;
+	}
+
+	error = dlm_recover_waiters_post(ls);
+	if (error) {
+		log_error(ls, "recover_waiters_post failed %d", error);
+		goto fail;
+	}
+
+	dlm_grant_after_purge(ls);
+
+	dlm_astd_wake();
+
+	log_debug(ls, "recover %llx done: %u ms", rv->seq,
+		  jiffies_to_msecs(jiffies - start));
+	mutex_unlock(&ls->ls_recoverd_active);
+
+	return 0;
+
+ fail:
+	dlm_release_root_list(ls);
+	log_debug(ls, "recover %llx error %d", rv->seq, error);
+	mutex_unlock(&ls->ls_recoverd_active);
+	return error;
+}
+
+static void do_ls_recovery(struct dlm_ls *ls)
+{
+	struct dlm_recover *rv = NULL;
+
+	spin_lock(&ls->ls_recover_lock);
+	rv = ls->ls_recover_args;
+	ls->ls_recover_args = NULL;
+	clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
+	spin_unlock(&ls->ls_recover_lock);
+
+	if (rv) {
+		ls_recover(ls, rv);
+		kfree(rv->nodeids);
+		kfree(rv);
+	}
+}
+
+static int dlm_recoverd(void *arg)
+{
+	struct dlm_ls *ls;
+
+	ls = dlm_find_lockspace_local(arg);
+	if (!ls) {
+		log_print("dlm_recoverd: no lockspace %p", arg);
+		return -1;
+	}
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (!test_bit(LSFL_WORK, &ls->ls_flags))
+			schedule();
+		set_current_state(TASK_RUNNING);
+
+		if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
+			do_ls_recovery(ls);
+	}
+
+	dlm_put_lockspace(ls);
+	return 0;
+}
+
+void dlm_recoverd_kick(struct dlm_ls *ls)
+{
+	set_bit(LSFL_WORK, &ls->ls_flags);
+	wake_up_process(ls->ls_recoverd_task);
+}
+
+int dlm_recoverd_start(struct dlm_ls *ls)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
+	if (IS_ERR(p))
+		error = PTR_ERR(p);
+	else
+                ls->ls_recoverd_task = p;
+	return error;
+}
+
+void dlm_recoverd_stop(struct dlm_ls *ls)
+{
+	kthread_stop(ls->ls_recoverd_task);
+}
+
+void dlm_recoverd_suspend(struct dlm_ls *ls)
+{
+	wake_up(&ls->ls_wait_general);
+	mutex_lock(&ls->ls_recoverd_active);
+}
+
+void dlm_recoverd_resume(struct dlm_ls *ls)
+{
+	mutex_unlock(&ls->ls_recoverd_active);
+}
+
diff --git a/fs/dlm/recoverd.h b/fs/dlm/recoverd.h
new file mode 100644
index 000000000000..866657c5d69d
--- /dev/null
+++ b/fs/dlm/recoverd.h
@@ -0,0 +1,24 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __RECOVERD_DOT_H__
+#define __RECOVERD_DOT_H__
+
+void dlm_recoverd_kick(struct dlm_ls *ls);
+void dlm_recoverd_stop(struct dlm_ls *ls);
+int dlm_recoverd_start(struct dlm_ls *ls);
+void dlm_recoverd_suspend(struct dlm_ls *ls);
+void dlm_recoverd_resume(struct dlm_ls *ls);
+
+#endif				/* __RECOVERD_DOT_H__ */
+
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
new file mode 100644
index 000000000000..7b2b089634a2
--- /dev/null
+++ b/fs/dlm/requestqueue.c
@@ -0,0 +1,184 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "member.h"
+#include "lock.h"
+#include "dir.h"
+#include "config.h"
+#include "requestqueue.h"
+
+struct rq_entry {
+	struct list_head list;
+	int nodeid;
+	char request[1];
+};
+
+/*
+ * Requests received while the lockspace is in recovery get added to the
+ * request queue and processed when recovery is complete.  This happens when
+ * the lockspace is suspended on some nodes before it is on others, or the
+ * lockspace is enabled on some while still suspended on others.
+ */
+
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+{
+	struct rq_entry *e;
+	int length = hd->h_length;
+
+	if (dlm_is_removed(ls, nodeid))
+		return;
+
+	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
+	if (!e) {
+		log_print("dlm_add_requestqueue: out of memory\n");
+		return;
+	}
+
+	e->nodeid = nodeid;
+	memcpy(e->request, hd, length);
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+	list_add_tail(&e->list, &ls->ls_requestqueue);
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
+int dlm_process_requestqueue(struct dlm_ls *ls)
+{
+	struct rq_entry *e;
+	struct dlm_header *hd;
+	int error = 0;
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+
+	for (;;) {
+		if (list_empty(&ls->ls_requestqueue)) {
+			mutex_unlock(&ls->ls_requestqueue_mutex);
+			error = 0;
+			break;
+		}
+		e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
+		mutex_unlock(&ls->ls_requestqueue_mutex);
+
+		hd = (struct dlm_header *) e->request;
+		error = dlm_receive_message(hd, e->nodeid, 1);
+
+		if (error == -EINTR) {
+			/* entry is left on requestqueue */
+			log_debug(ls, "process_requestqueue abort eintr");
+			break;
+		}
+
+		mutex_lock(&ls->ls_requestqueue_mutex);
+		list_del(&e->list);
+		kfree(e);
+
+		if (dlm_locking_stopped(ls)) {
+			log_debug(ls, "process_requestqueue abort running");
+			mutex_unlock(&ls->ls_requestqueue_mutex);
+			error = -EINTR;
+			break;
+		}
+		schedule();
+	}
+
+	return error;
+}
+
+/*
+ * After recovery is done, locking is resumed and dlm_recoverd takes all the
+ * saved requests and processes them as they would have been by dlm_recvd.  At
+ * the same time, dlm_recvd will start receiving new requests from remote
+ * nodes.  We want to delay dlm_recvd processing new requests until
+ * dlm_recoverd has finished processing the old saved requests.
+ */
+
+void dlm_wait_requestqueue(struct dlm_ls *ls)
+{
+	for (;;) {
+		mutex_lock(&ls->ls_requestqueue_mutex);
+		if (list_empty(&ls->ls_requestqueue))
+			break;
+		if (dlm_locking_stopped(ls))
+			break;
+		mutex_unlock(&ls->ls_requestqueue_mutex);
+		schedule();
+	}
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
+static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
+{
+	uint32_t type = ms->m_type;
+
+	if (dlm_is_removed(ls, nodeid))
+		return 1;
+
+	/* directory operations are always purged because the directory is
+	   always rebuilt during recovery and the lookups resent */
+
+	if (type == DLM_MSG_REMOVE ||
+	    type == DLM_MSG_LOOKUP ||
+	    type == DLM_MSG_LOOKUP_REPLY)
+		return 1;
+
+	if (!dlm_no_directory(ls))
+		return 0;
+
+	/* with no directory, the master is likely to change as a part of
+	   recovery; requests to/from the defunct master need to be purged */
+
+	switch (type) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_CANCEL:
+		/* we're no longer the master of this resource, the sender
+		   will resend to the new master (see waiter_needs_recovery) */
+
+		if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
+			return 1;
+		break;
+
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_UNLOCK_REPLY:
+	case DLM_MSG_CANCEL_REPLY:
+	case DLM_MSG_GRANT:
+		/* this reply is from the former master of the resource,
+		   we'll resend to the new master if needed */
+
+		if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
+			return 1;
+		break;
+	}
+
+	return 0;
+}
+
+void dlm_purge_requestqueue(struct dlm_ls *ls)
+{
+	struct dlm_message *ms;
+	struct rq_entry *e, *safe;
+
+	mutex_lock(&ls->ls_requestqueue_mutex);
+	list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
+		ms = (struct dlm_message *) e->request;
+
+		if (purge_request(ls, ms, e->nodeid)) {
+			list_del(&e->list);
+			kfree(e);
+		}
+	}
+	mutex_unlock(&ls->ls_requestqueue_mutex);
+}
+
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
new file mode 100644
index 000000000000..349f0d292d95
--- /dev/null
+++ b/fs/dlm/requestqueue.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __REQUESTQUEUE_DOT_H__
+#define __REQUESTQUEUE_DOT_H__
+
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
+int dlm_process_requestqueue(struct dlm_ls *ls);
+void dlm_wait_requestqueue(struct dlm_ls *ls);
+void dlm_purge_requestqueue(struct dlm_ls *ls);
+
+#endif
+
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
new file mode 100644
index 000000000000..c37e93e4f2df
--- /dev/null
+++ b/fs/dlm/user.c
@@ -0,0 +1,788 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/module.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/signal.h>
+#include <linux/spinlock.h>
+#include <linux/dlm.h>
+#include <linux/dlm_device.h>
+
+#include "dlm_internal.h"
+#include "lockspace.h"
+#include "lock.h"
+#include "lvb_table.h"
+
+static const char *name_prefix="dlm";
+static struct miscdevice ctl_device;
+static struct file_operations device_fops;
+
+#ifdef CONFIG_COMPAT
+
+struct dlm_lock_params32 {
+	__u8 mode;
+	__u8 namelen;
+	__u16 flags;
+	__u32 lkid;
+	__u32 parent;
+
+	__u32 castparam;
+	__u32 castaddr;
+	__u32 bastparam;
+	__u32 bastaddr;
+	__u32 lksb;
+
+	char lvb[DLM_USER_LVB_LEN];
+	char name[0];
+};
+
+struct dlm_write_request32 {
+	__u32 version[3];
+	__u8 cmd;
+	__u8 is64bit;
+	__u8 unused[2];
+
+	union  {
+		struct dlm_lock_params32 lock;
+		struct dlm_lspace_params lspace;
+	} i;
+};
+
+struct dlm_lksb32 {
+	__u32 sb_status;
+	__u32 sb_lkid;
+	__u8 sb_flags;
+	__u32 sb_lvbptr;
+};
+
+struct dlm_lock_result32 {
+	__u32 length;
+	__u32 user_astaddr;
+	__u32 user_astparam;
+	__u32 user_lksb;
+	struct dlm_lksb32 lksb;
+	__u8 bast_mode;
+	__u8 unused[3];
+	/* Offsets may be zero if no data is present */
+	__u32 lvb_offset;
+};
+
+static void compat_input(struct dlm_write_request *kb,
+			 struct dlm_write_request32 *kb32)
+{
+	kb->version[0] = kb32->version[0];
+	kb->version[1] = kb32->version[1];
+	kb->version[2] = kb32->version[2];
+
+	kb->cmd = kb32->cmd;
+	kb->is64bit = kb32->is64bit;
+	if (kb->cmd == DLM_USER_CREATE_LOCKSPACE ||
+	    kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
+		kb->i.lspace.flags = kb32->i.lspace.flags;
+		kb->i.lspace.minor = kb32->i.lspace.minor;
+		strcpy(kb->i.lspace.name, kb32->i.lspace.name);
+	} else {
+		kb->i.lock.mode = kb32->i.lock.mode;
+		kb->i.lock.namelen = kb32->i.lock.namelen;
+		kb->i.lock.flags = kb32->i.lock.flags;
+		kb->i.lock.lkid = kb32->i.lock.lkid;
+		kb->i.lock.parent = kb32->i.lock.parent;
+		kb->i.lock.castparam = (void *)(long)kb32->i.lock.castparam;
+		kb->i.lock.castaddr = (void *)(long)kb32->i.lock.castaddr;
+		kb->i.lock.bastparam = (void *)(long)kb32->i.lock.bastparam;
+		kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
+		kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
+		memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
+		memcpy(kb->i.lock.name, kb32->i.lock.name, kb->i.lock.namelen);
+	}
+}
+
+static void compat_output(struct dlm_lock_result *res,
+			  struct dlm_lock_result32 *res32)
+{
+	res32->length = res->length - (sizeof(struct dlm_lock_result) -
+				       sizeof(struct dlm_lock_result32));
+	res32->user_astaddr = (__u32)(long)res->user_astaddr;
+	res32->user_astparam = (__u32)(long)res->user_astparam;
+	res32->user_lksb = (__u32)(long)res->user_lksb;
+	res32->bast_mode = res->bast_mode;
+
+	res32->lvb_offset = res->lvb_offset;
+	res32->length = res->length;
+
+	res32->lksb.sb_status = res->lksb.sb_status;
+	res32->lksb.sb_flags = res->lksb.sb_flags;
+	res32->lksb.sb_lkid = res->lksb.sb_lkid;
+	res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr;
+}
+#endif
+
+
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
+{
+	struct dlm_ls *ls;
+	struct dlm_user_args *ua;
+	struct dlm_user_proc *proc;
+	int remove_ownqueue = 0;
+
+	/* dlm_clear_proc_locks() sets ORPHAN/DEAD flag on each
+	   lkb before dealing with it.  We need to check this
+	   flag before taking ls_clear_proc_locks mutex because if
+	   it's set, dlm_clear_proc_locks() holds the mutex. */
+
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
+		/* log_print("user_add_ast skip1 %x", lkb->lkb_flags); */
+		return;
+	}
+
+	ls = lkb->lkb_resource->res_ls;
+	mutex_lock(&ls->ls_clear_proc_locks);
+
+	/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
+	   can't be delivered.  For ORPHAN's, dlm_clear_proc_locks() freed
+	   lkb->ua so we can't try to use it. */
+
+	if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) {
+		/* log_print("user_add_ast skip2 %x", lkb->lkb_flags); */
+		goto out;
+	}
+
+	DLM_ASSERT(lkb->lkb_astparam, dlm_print_lkb(lkb););
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+	proc = ua->proc;
+
+	if (type == AST_BAST && ua->bastaddr == NULL)
+		goto out;
+
+	spin_lock(&proc->asts_spin);
+	if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+		kref_get(&lkb->lkb_ref);
+		list_add_tail(&lkb->lkb_astqueue, &proc->asts);
+		lkb->lkb_ast_type |= type;
+		wake_up_interruptible(&proc->wait);
+	}
+
+	/* noqueue requests that fail may need to be removed from the
+	   proc's locks list, there should be a better way of detecting
+	   this situation than checking all these things... */
+
+	if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV &&
+	    ua->lksb.sb_status == -EAGAIN && !list_empty(&lkb->lkb_ownqueue))
+		remove_ownqueue = 1;
+
+	/* We want to copy the lvb to userspace when the completion
+	   ast is read if the status is 0, the lock has an lvb and
+	   lvb_ops says we should.  We could probably have set_lvb_lock()
+	   set update_user_lvb instead and not need old_mode */
+
+	if ((lkb->lkb_ast_type & AST_COMP) &&
+	    (lkb->lkb_lksb->sb_status == 0) &&
+	    lkb->lkb_lksb->sb_lvbptr &&
+	    dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
+		ua->update_user_lvb = 1;
+	else
+		ua->update_user_lvb = 0;
+
+	spin_unlock(&proc->asts_spin);
+
+	if (remove_ownqueue) {
+		spin_lock(&ua->proc->locks_spin);
+		list_del_init(&lkb->lkb_ownqueue);
+		spin_unlock(&ua->proc->locks_spin);
+		dlm_put_lkb(lkb);
+	}
+ out:
+	mutex_unlock(&ls->ls_clear_proc_locks);
+}
+
+static int device_user_lock(struct dlm_user_proc *proc,
+			    struct dlm_lock_params *params)
+{
+	struct dlm_ls *ls;
+	struct dlm_user_args *ua;
+	int error = -ENOMEM;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	if (!params->castaddr || !params->lksb) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+	if (!ua)
+		goto out;
+	ua->proc = proc;
+	ua->user_lksb = params->lksb;
+	ua->castparam = params->castparam;
+	ua->castaddr = params->castaddr;
+	ua->bastparam = params->bastparam;
+	ua->bastaddr = params->bastaddr;
+
+	if (params->flags & DLM_LKF_CONVERT)
+		error = dlm_user_convert(ls, ua,
+				         params->mode, params->flags,
+				         params->lkid, params->lvb);
+	else {
+		error = dlm_user_request(ls, ua,
+					 params->mode, params->flags,
+					 params->name, params->namelen,
+					 params->parent);
+		if (!error)
+			error = ua->lksb.sb_lkid;
+	}
+ out:
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+static int device_user_unlock(struct dlm_user_proc *proc,
+			      struct dlm_lock_params *params)
+{
+	struct dlm_ls *ls;
+	struct dlm_user_args *ua;
+	int error = -ENOMEM;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
+	if (!ua)
+		goto out;
+	ua->proc = proc;
+	ua->user_lksb = params->lksb;
+	ua->castparam = params->castparam;
+	ua->castaddr = params->castaddr;
+
+	if (params->flags & DLM_LKF_CANCEL)
+		error = dlm_user_cancel(ls, ua, params->flags, params->lkid);
+	else
+		error = dlm_user_unlock(ls, ua, params->flags, params->lkid,
+					params->lvb);
+ out:
+	dlm_put_lockspace(ls);
+	return error;
+}
+
+static int device_create_lockspace(struct dlm_lspace_params *params)
+{
+	dlm_lockspace_t *lockspace;
+	struct dlm_ls *ls;
+	int error, len;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	error = dlm_new_lockspace(params->name, strlen(params->name),
+				  &lockspace, 0, DLM_USER_LVB_LEN);
+	if (error)
+		return error;
+
+	ls = dlm_find_lockspace_local(lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	error = -ENOMEM;
+	len = strlen(params->name) + strlen(name_prefix) + 2;
+	ls->ls_device.name = kzalloc(len, GFP_KERNEL);
+	if (!ls->ls_device.name)
+		goto fail;
+	snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix,
+		 params->name);
+	ls->ls_device.fops = &device_fops;
+	ls->ls_device.minor = MISC_DYNAMIC_MINOR;
+
+	error = misc_register(&ls->ls_device);
+	if (error) {
+		kfree(ls->ls_device.name);
+		goto fail;
+	}
+
+	error = ls->ls_device.minor;
+	dlm_put_lockspace(ls);
+	return error;
+
+ fail:
+	dlm_put_lockspace(ls);
+	dlm_release_lockspace(lockspace, 0);
+	return error;
+}
+
+static int device_remove_lockspace(struct dlm_lspace_params *params)
+{
+	dlm_lockspace_t *lockspace;
+	struct dlm_ls *ls;
+	int error, force = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ls = dlm_find_lockspace_device(params->minor);
+	if (!ls)
+		return -ENOENT;
+
+	error = misc_deregister(&ls->ls_device);
+	if (error) {
+		dlm_put_lockspace(ls);
+		goto out;
+	}
+	kfree(ls->ls_device.name);
+
+	if (params->flags & DLM_USER_LSFLG_FORCEFREE)
+		force = 2;
+
+	lockspace = ls->ls_local_handle;
+
+	/* dlm_release_lockspace waits for references to go to zero,
+	   so all processes will need to close their device for the ls
+	   before the release will procede */
+
+	dlm_put_lockspace(ls);
+	error = dlm_release_lockspace(lockspace, force);
+ out:
+	return error;
+}
+
+/* Check the user's version matches ours */
+static int check_version(struct dlm_write_request *req)
+{
+	if (req->version[0] != DLM_DEVICE_VERSION_MAJOR ||
+	    (req->version[0] == DLM_DEVICE_VERSION_MAJOR &&
+	     req->version[1] > DLM_DEVICE_VERSION_MINOR)) {
+
+		printk(KERN_DEBUG "dlm: process %s (%d) version mismatch "
+		       "user (%d.%d.%d) kernel (%d.%d.%d)\n",
+		       current->comm,
+		       current->pid,
+		       req->version[0],
+		       req->version[1],
+		       req->version[2],
+		       DLM_DEVICE_VERSION_MAJOR,
+		       DLM_DEVICE_VERSION_MINOR,
+		       DLM_DEVICE_VERSION_PATCH);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * device_write
+ *
+ *   device_user_lock
+ *     dlm_user_request -> request_lock
+ *     dlm_user_convert -> convert_lock
+ *
+ *   device_user_unlock
+ *     dlm_user_unlock -> unlock_lock
+ *     dlm_user_cancel -> cancel_lock
+ *
+ *   device_create_lockspace
+ *     dlm_new_lockspace
+ *
+ *   device_remove_lockspace
+ *     dlm_release_lockspace
+ */
+
+/* a write to a lockspace device is a lock or unlock request, a write
+   to the control device is to create/remove a lockspace */
+
+static ssize_t device_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	struct dlm_write_request *kbuf;
+	sigset_t tmpsig, allsigs;
+	int error;
+
+#ifdef CONFIG_COMPAT
+	if (count < sizeof(struct dlm_write_request32))
+#else
+	if (count < sizeof(struct dlm_write_request))
+#endif
+		return -EINVAL;
+
+	kbuf = kmalloc(count, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, buf, count)) {
+		error = -EFAULT;
+		goto out_free;
+	}
+
+	if (check_version(kbuf)) {
+		error = -EBADE;
+		goto out_free;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (!kbuf->is64bit) {
+		struct dlm_write_request32 *k32buf;
+		k32buf = (struct dlm_write_request32 *)kbuf;
+		kbuf = kmalloc(count + (sizeof(struct dlm_write_request) -
+			       sizeof(struct dlm_write_request32)), GFP_KERNEL);
+		if (!kbuf)
+			return -ENOMEM;
+
+		if (proc)
+			set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
+		compat_input(kbuf, k32buf);
+		kfree(k32buf);
+	}
+#endif
+
+	/* do we really need this? can a write happen after a close? */
+	if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
+	    test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
+		return -EINVAL;
+
+	sigfillset(&allsigs);
+	sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+
+	error = -EINVAL;
+
+	switch (kbuf->cmd)
+	{
+	case DLM_USER_LOCK:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_sig;
+		}
+		error = device_user_lock(proc, &kbuf->i.lock);
+		break;
+
+	case DLM_USER_UNLOCK:
+		if (!proc) {
+			log_print("no locking on control device");
+			goto out_sig;
+		}
+		error = device_user_unlock(proc, &kbuf->i.lock);
+		break;
+
+	case DLM_USER_CREATE_LOCKSPACE:
+		if (proc) {
+			log_print("create/remove only on control device");
+			goto out_sig;
+		}
+		error = device_create_lockspace(&kbuf->i.lspace);
+		break;
+
+	case DLM_USER_REMOVE_LOCKSPACE:
+		if (proc) {
+			log_print("create/remove only on control device");
+			goto out_sig;
+		}
+		error = device_remove_lockspace(&kbuf->i.lspace);
+		break;
+
+	default:
+		log_print("Unknown command passed to DLM device : %d\n",
+			  kbuf->cmd);
+	}
+
+ out_sig:
+	sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+	recalc_sigpending();
+ out_free:
+	kfree(kbuf);
+	return error;
+}
+
+/* Every process that opens the lockspace device has its own "proc" structure
+   hanging off the open file that's used to keep track of locks owned by the
+   process and asts that need to be delivered to the process. */
+
+static int device_open(struct inode *inode, struct file *file)
+{
+	struct dlm_user_proc *proc;
+	struct dlm_ls *ls;
+
+	ls = dlm_find_lockspace_device(iminor(inode));
+	if (!ls)
+		return -ENOENT;
+
+	proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
+	if (!proc) {
+		dlm_put_lockspace(ls);
+		return -ENOMEM;
+	}
+
+	proc->lockspace = ls->ls_local_handle;
+	INIT_LIST_HEAD(&proc->asts);
+	INIT_LIST_HEAD(&proc->locks);
+	spin_lock_init(&proc->asts_spin);
+	spin_lock_init(&proc->locks_spin);
+	init_waitqueue_head(&proc->wait);
+	file->private_data = proc;
+
+	return 0;
+}
+
+static int device_close(struct inode *inode, struct file *file)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	struct dlm_ls *ls;
+	sigset_t tmpsig, allsigs;
+
+	ls = dlm_find_lockspace_local(proc->lockspace);
+	if (!ls)
+		return -ENOENT;
+
+	sigfillset(&allsigs);
+	sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
+
+	set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags);
+
+	dlm_clear_proc_locks(ls, proc);
+
+	/* at this point no more lkb's should exist for this lockspace,
+	   so there's no chance of dlm_user_add_ast() being called and
+	   looking for lkb->ua->proc */
+
+	kfree(proc);
+	file->private_data = NULL;
+
+	dlm_put_lockspace(ls);
+	dlm_put_lockspace(ls);  /* for the find in device_open() */
+
+	/* FIXME: AUTOFREE: if this ls is no longer used do
+	   device_remove_lockspace() */
+
+	sigprocmask(SIG_SETMASK, &tmpsig, NULL);
+	recalc_sigpending();
+
+	return 0;
+}
+
+static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
+			       int bmode, char __user *buf, size_t count)
+{
+#ifdef CONFIG_COMPAT
+	struct dlm_lock_result32 result32;
+#endif
+	struct dlm_lock_result result;
+	void *resultptr;
+	int error=0;
+	int len;
+	int struct_len;
+
+	memset(&result, 0, sizeof(struct dlm_lock_result));
+	memcpy(&result.lksb, &ua->lksb, sizeof(struct dlm_lksb));
+	result.user_lksb = ua->user_lksb;
+
+	/* FIXME: dlm1 provides for the user's bastparam/addr to not be updated
+	   in a conversion unless the conversion is successful.  See code
+	   in dlm_user_convert() for updating ua from ua_tmp.  OpenVMS, though,
+	   notes that a new blocking AST address and parameter are set even if
+	   the conversion fails, so maybe we should just do that. */
+
+	if (type == AST_BAST) {
+		result.user_astaddr = ua->bastaddr;
+		result.user_astparam = ua->bastparam;
+		result.bast_mode = bmode;
+	} else {
+		result.user_astaddr = ua->castaddr;
+		result.user_astparam = ua->castparam;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (compat)
+		len = sizeof(struct dlm_lock_result32);
+	else
+#endif
+		len = sizeof(struct dlm_lock_result);
+	struct_len = len;
+
+	/* copy lvb to userspace if there is one, it's been updated, and
+	   the user buffer has space for it */
+
+	if (ua->update_user_lvb && ua->lksb.sb_lvbptr &&
+	    count >= len + DLM_USER_LVB_LEN) {
+		if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
+				 DLM_USER_LVB_LEN)) {
+			error = -EFAULT;
+			goto out;
+		}
+
+		result.lvb_offset = len;
+		len += DLM_USER_LVB_LEN;
+	}
+
+	result.length = len;
+	resultptr = &result;
+#ifdef CONFIG_COMPAT
+	if (compat) {
+		compat_output(&result, &result32);
+		resultptr = &result32;
+	}
+#endif
+
+	if (copy_to_user(buf, resultptr, struct_len))
+		error = -EFAULT;
+	else
+		error = len;
+ out:
+	return error;
+}
+
+/* a read returns a single ast described in a struct dlm_lock_result */
+
+static ssize_t device_read(struct file *file, char __user *buf, size_t count,
+			   loff_t *ppos)
+{
+	struct dlm_user_proc *proc = file->private_data;
+	struct dlm_lkb *lkb;
+	struct dlm_user_args *ua;
+	DECLARE_WAITQUEUE(wait, current);
+	int error, type=0, bmode=0, removed = 0;
+
+#ifdef CONFIG_COMPAT
+	if (count < sizeof(struct dlm_lock_result32))
+#else
+	if (count < sizeof(struct dlm_lock_result))
+#endif
+		return -EINVAL;
+
+	/* do we really need this? can a read happen after a close? */
+	if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
+		return -EINVAL;
+
+	spin_lock(&proc->asts_spin);
+	if (list_empty(&proc->asts)) {
+		if (file->f_flags & O_NONBLOCK) {
+			spin_unlock(&proc->asts_spin);
+			return -EAGAIN;
+		}
+
+		add_wait_queue(&proc->wait, &wait);
+
+	repeat:
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (list_empty(&proc->asts) && !signal_pending(current)) {
+			spin_unlock(&proc->asts_spin);
+			schedule();
+			spin_lock(&proc->asts_spin);
+			goto repeat;
+		}
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&proc->wait, &wait);
+
+		if (signal_pending(current)) {
+			spin_unlock(&proc->asts_spin);
+			return -ERESTARTSYS;
+		}
+	}
+
+	if (list_empty(&proc->asts)) {
+		spin_unlock(&proc->asts_spin);
+		return -EAGAIN;
+	}
+
+	/* there may be both completion and blocking asts to return for
+	   the lkb, don't remove lkb from asts list unless no asts remain */
+
+	lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
+
+	if (lkb->lkb_ast_type & AST_COMP) {
+		lkb->lkb_ast_type &= ~AST_COMP;
+		type = AST_COMP;
+	} else if (lkb->lkb_ast_type & AST_BAST) {
+		lkb->lkb_ast_type &= ~AST_BAST;
+		type = AST_BAST;
+		bmode = lkb->lkb_bastmode;
+	}
+
+	if (!lkb->lkb_ast_type) {
+		list_del(&lkb->lkb_astqueue);
+		removed = 1;
+	}
+	spin_unlock(&proc->asts_spin);
+
+	ua = (struct dlm_user_args *)lkb->lkb_astparam;
+	error = copy_result_to_user(ua,
+			 	test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				type, bmode, buf, count);
+
+	/* removes reference for the proc->asts lists added by
+	   dlm_user_add_ast() and may result in the lkb being freed */
+	if (removed)
+		dlm_put_lkb(lkb);
+
+	return error;
+}
+
+static unsigned int device_poll(struct file *file, poll_table *wait)
+{
+	struct dlm_user_proc *proc = file->private_data;
+
+	poll_wait(file, &proc->wait, wait);
+
+	spin_lock(&proc->asts_spin);
+	if (!list_empty(&proc->asts)) {
+		spin_unlock(&proc->asts_spin);
+		return POLLIN | POLLRDNORM;
+	}
+	spin_unlock(&proc->asts_spin);
+	return 0;
+}
+
+static int ctl_device_open(struct inode *inode, struct file *file)
+{
+	file->private_data = NULL;
+	return 0;
+}
+
+static int ctl_device_close(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static struct file_operations device_fops = {
+	.open    = device_open,
+	.release = device_close,
+	.read    = device_read,
+	.write   = device_write,
+	.poll    = device_poll,
+	.owner   = THIS_MODULE,
+};
+
+static struct file_operations ctl_device_fops = {
+	.open    = ctl_device_open,
+	.release = ctl_device_close,
+	.write   = device_write,
+	.owner   = THIS_MODULE,
+};
+
+int dlm_user_init(void)
+{
+	int error;
+
+	ctl_device.name = "dlm-control";
+	ctl_device.fops = &ctl_device_fops;
+	ctl_device.minor = MISC_DYNAMIC_MINOR;
+
+	error = misc_register(&ctl_device);
+	if (error)
+		log_print("misc_register failed for control device");
+
+	return error;
+}
+
+void dlm_user_exit(void)
+{
+	misc_deregister(&ctl_device);
+}
+
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
new file mode 100644
index 000000000000..d38e9f3e4151
--- /dev/null
+++ b/fs/dlm/user.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C) 2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ */
+
+#ifndef __USER_DOT_H__
+#define __USER_DOT_H__
+
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
+int dlm_user_init(void);
+void dlm_user_exit(void);
+
+#endif
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
new file mode 100644
index 000000000000..767197db9944
--- /dev/null
+++ b/fs/dlm/util.c
@@ -0,0 +1,161 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#include "dlm_internal.h"
+#include "rcom.h"
+#include "util.h"
+
+static void header_out(struct dlm_header *hd)
+{
+	hd->h_version		= cpu_to_le32(hd->h_version);
+	hd->h_lockspace		= cpu_to_le32(hd->h_lockspace);
+	hd->h_nodeid		= cpu_to_le32(hd->h_nodeid);
+	hd->h_length		= cpu_to_le16(hd->h_length);
+}
+
+static void header_in(struct dlm_header *hd)
+{
+	hd->h_version		= le32_to_cpu(hd->h_version);
+	hd->h_lockspace		= le32_to_cpu(hd->h_lockspace);
+	hd->h_nodeid		= le32_to_cpu(hd->h_nodeid);
+	hd->h_length		= le16_to_cpu(hd->h_length);
+}
+
+void dlm_message_out(struct dlm_message *ms)
+{
+	struct dlm_header *hd = (struct dlm_header *) ms;
+
+	header_out(hd);
+
+	ms->m_type		= cpu_to_le32(ms->m_type);
+	ms->m_nodeid		= cpu_to_le32(ms->m_nodeid);
+	ms->m_pid		= cpu_to_le32(ms->m_pid);
+	ms->m_lkid		= cpu_to_le32(ms->m_lkid);
+	ms->m_remid		= cpu_to_le32(ms->m_remid);
+	ms->m_parent_lkid	= cpu_to_le32(ms->m_parent_lkid);
+	ms->m_parent_remid	= cpu_to_le32(ms->m_parent_remid);
+	ms->m_exflags		= cpu_to_le32(ms->m_exflags);
+	ms->m_sbflags		= cpu_to_le32(ms->m_sbflags);
+	ms->m_flags		= cpu_to_le32(ms->m_flags);
+	ms->m_lvbseq		= cpu_to_le32(ms->m_lvbseq);
+	ms->m_hash		= cpu_to_le32(ms->m_hash);
+	ms->m_status		= cpu_to_le32(ms->m_status);
+	ms->m_grmode		= cpu_to_le32(ms->m_grmode);
+	ms->m_rqmode		= cpu_to_le32(ms->m_rqmode);
+	ms->m_bastmode		= cpu_to_le32(ms->m_bastmode);
+	ms->m_asts		= cpu_to_le32(ms->m_asts);
+	ms->m_result		= cpu_to_le32(ms->m_result);
+}
+
+void dlm_message_in(struct dlm_message *ms)
+{
+	struct dlm_header *hd = (struct dlm_header *) ms;
+
+	header_in(hd);
+
+	ms->m_type		= le32_to_cpu(ms->m_type);
+	ms->m_nodeid		= le32_to_cpu(ms->m_nodeid);
+	ms->m_pid		= le32_to_cpu(ms->m_pid);
+	ms->m_lkid		= le32_to_cpu(ms->m_lkid);
+	ms->m_remid		= le32_to_cpu(ms->m_remid);
+	ms->m_parent_lkid	= le32_to_cpu(ms->m_parent_lkid);
+	ms->m_parent_remid	= le32_to_cpu(ms->m_parent_remid);
+	ms->m_exflags		= le32_to_cpu(ms->m_exflags);
+	ms->m_sbflags		= le32_to_cpu(ms->m_sbflags);
+	ms->m_flags		= le32_to_cpu(ms->m_flags);
+	ms->m_lvbseq		= le32_to_cpu(ms->m_lvbseq);
+	ms->m_hash		= le32_to_cpu(ms->m_hash);
+	ms->m_status		= le32_to_cpu(ms->m_status);
+	ms->m_grmode		= le32_to_cpu(ms->m_grmode);
+	ms->m_rqmode		= le32_to_cpu(ms->m_rqmode);
+	ms->m_bastmode		= le32_to_cpu(ms->m_bastmode);
+	ms->m_asts		= le32_to_cpu(ms->m_asts);
+	ms->m_result		= le32_to_cpu(ms->m_result);
+}
+
+static void rcom_lock_out(struct rcom_lock *rl)
+{
+	rl->rl_ownpid		= cpu_to_le32(rl->rl_ownpid);
+	rl->rl_lkid		= cpu_to_le32(rl->rl_lkid);
+	rl->rl_remid		= cpu_to_le32(rl->rl_remid);
+	rl->rl_parent_lkid	= cpu_to_le32(rl->rl_parent_lkid);
+	rl->rl_parent_remid	= cpu_to_le32(rl->rl_parent_remid);
+	rl->rl_exflags		= cpu_to_le32(rl->rl_exflags);
+	rl->rl_flags		= cpu_to_le32(rl->rl_flags);
+	rl->rl_lvbseq		= cpu_to_le32(rl->rl_lvbseq);
+	rl->rl_result		= cpu_to_le32(rl->rl_result);
+	rl->rl_wait_type	= cpu_to_le16(rl->rl_wait_type);
+	rl->rl_namelen		= cpu_to_le16(rl->rl_namelen);
+}
+
+static void rcom_lock_in(struct rcom_lock *rl)
+{
+	rl->rl_ownpid		= le32_to_cpu(rl->rl_ownpid);
+	rl->rl_lkid		= le32_to_cpu(rl->rl_lkid);
+	rl->rl_remid		= le32_to_cpu(rl->rl_remid);
+	rl->rl_parent_lkid	= le32_to_cpu(rl->rl_parent_lkid);
+	rl->rl_parent_remid	= le32_to_cpu(rl->rl_parent_remid);
+	rl->rl_exflags		= le32_to_cpu(rl->rl_exflags);
+	rl->rl_flags		= le32_to_cpu(rl->rl_flags);
+	rl->rl_lvbseq		= le32_to_cpu(rl->rl_lvbseq);
+	rl->rl_result		= le32_to_cpu(rl->rl_result);
+	rl->rl_wait_type	= le16_to_cpu(rl->rl_wait_type);
+	rl->rl_namelen		= le16_to_cpu(rl->rl_namelen);
+}
+
+static void rcom_config_out(struct rcom_config *rf)
+{
+	rf->rf_lvblen		= cpu_to_le32(rf->rf_lvblen);
+	rf->rf_lsflags		= cpu_to_le32(rf->rf_lsflags);
+}
+
+static void rcom_config_in(struct rcom_config *rf)
+{
+	rf->rf_lvblen		= le32_to_cpu(rf->rf_lvblen);
+	rf->rf_lsflags		= le32_to_cpu(rf->rf_lsflags);
+}
+
+void dlm_rcom_out(struct dlm_rcom *rc)
+{
+	struct dlm_header *hd = (struct dlm_header *) rc;
+	int type = rc->rc_type;
+
+	header_out(hd);
+
+	rc->rc_type		= cpu_to_le32(rc->rc_type);
+	rc->rc_result		= cpu_to_le32(rc->rc_result);
+	rc->rc_id		= cpu_to_le64(rc->rc_id);
+
+	if (type == DLM_RCOM_LOCK)
+		rcom_lock_out((struct rcom_lock *) rc->rc_buf);
+
+	else if (type == DLM_RCOM_STATUS_REPLY)
+		rcom_config_out((struct rcom_config *) rc->rc_buf);
+}
+
+void dlm_rcom_in(struct dlm_rcom *rc)
+{
+	struct dlm_header *hd = (struct dlm_header *) rc;
+
+	header_in(hd);
+
+	rc->rc_type		= le32_to_cpu(rc->rc_type);
+	rc->rc_result		= le32_to_cpu(rc->rc_result);
+	rc->rc_id		= le64_to_cpu(rc->rc_id);
+
+	if (rc->rc_type == DLM_RCOM_LOCK)
+		rcom_lock_in((struct rcom_lock *) rc->rc_buf);
+
+	else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
+		rcom_config_in((struct rcom_config *) rc->rc_buf);
+}
+
diff --git a/fs/dlm/util.h b/fs/dlm/util.h
new file mode 100644
index 000000000000..2b25915161c0
--- /dev/null
+++ b/fs/dlm/util.h
@@ -0,0 +1,22 @@
+/******************************************************************************
+*******************************************************************************
+**
+**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**
+**  This copyrighted material is made available to anyone wishing to use,
+**  modify, copy, or redistribute it subject to the terms and conditions
+**  of the GNU General Public License v.2.
+**
+*******************************************************************************
+******************************************************************************/
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+void dlm_message_out(struct dlm_message *ms);
+void dlm_message_in(struct dlm_message *ms);
+void dlm_rcom_out(struct dlm_rcom *rc);
+void dlm_rcom_in(struct dlm_rcom *rc);
+
+#endif
+
diff --git a/fs/dnotify.c b/fs/dnotify.c
index f932591df5a4..2b0442db67e0 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -92,7 +92,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 		prev = &odn->dn_next;
 	}
 
-	error = f_setown(filp, current->pid, 0);
+	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 	if (error)
 		goto out_free;
 
diff --git a/fs/dquot.c b/fs/dquot.c
index 0122a279106a..9af789567e51 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -834,6 +834,9 @@ static void print_warning(struct dquot *dquot, const char warntype)
 	if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags)))
 		return;
 
+	mutex_lock(&tty_mutex);
+	if (!current->signal->tty)
+		goto out_lock;
 	tty_write_message(current->signal->tty, dquot->dq_sb->s_id);
 	if (warntype == ISOFTWARN || warntype == BSOFTWARN)
 		tty_write_message(current->signal->tty, ": warning, ");
@@ -861,6 +864,8 @@ static void print_warning(struct dquot *dquot, const char warntype)
 			break;
 	}
 	tty_write_message(current->signal->tty, msg);
+out_lock:
+	mutex_unlock(&tty_mutex);
 }
 
 static inline void flush_warnings(struct dquot **dquots, char *warntype)
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
new file mode 100644
index 000000000000..ca6562451eeb
--- /dev/null
+++ b/fs/ecryptfs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the Linux 2.6 eCryptfs
+#
+
+obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
+
+ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o crypto.o keystore.o debug.o
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
new file mode 100644
index 000000000000..ed35a9712fa1
--- /dev/null
+++ b/fs/ecryptfs/crypto.c
@@ -0,0 +1,1659 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2004 Erez Zadok
+ * Copyright (C) 2001-2004 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *   		Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/random.h>
+#include <linux/compiler.h>
+#include <linux/key.h>
+#include <linux/namei.h>
+#include <linux/crypto.h>
+#include <linux/file.h>
+#include <linux/scatterlist.h>
+#include "ecryptfs_kernel.h"
+
+static int
+ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
+			     struct page *dst_page, int dst_offset,
+			     struct page *src_page, int src_offset, int size,
+			     unsigned char *iv);
+static int
+ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
+			     struct page *dst_page, int dst_offset,
+			     struct page *src_page, int src_offset, int size,
+			     unsigned char *iv);
+
+/**
+ * ecryptfs_to_hex
+ * @dst: Buffer to take hex character representation of contents of
+ *       src; must be at least of size (src_size * 2)
+ * @src: Buffer to be converted to a hex string respresentation
+ * @src_size: number of bytes to convert
+ */
+void ecryptfs_to_hex(char *dst, char *src, size_t src_size)
+{
+	int x;
+
+	for (x = 0; x < src_size; x++)
+		sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]);
+}
+
+/**
+ * ecryptfs_from_hex
+ * @dst: Buffer to take the bytes from src hex; must be at least of
+ *       size (src_size / 2)
+ * @src: Buffer to be converted from a hex string respresentation to raw value
+ * @dst_size: size of dst buffer, or number of hex characters pairs to convert
+ */
+void ecryptfs_from_hex(char *dst, char *src, int dst_size)
+{
+	int x;
+	char tmp[3] = { 0, };
+
+	for (x = 0; x < dst_size; x++) {
+		tmp[0] = src[x * 2];
+		tmp[1] = src[x * 2 + 1];
+		dst[x] = (unsigned char)simple_strtol(tmp, NULL, 16);
+	}
+}
+
+/**
+ * ecryptfs_calculate_md5 - calculates the md5 of @src
+ * @dst: Pointer to 16 bytes of allocated memory
+ * @crypt_stat: Pointer to crypt_stat struct for the current inode
+ * @src: Data to be md5'd
+ * @len: Length of @src
+ *
+ * Uses the allocated crypto context that crypt_stat references to
+ * generate the MD5 sum of the contents of src.
+ */
+static int ecryptfs_calculate_md5(char *dst,
+				  struct ecryptfs_crypt_stat *crypt_stat,
+				  char *src, int len)
+{
+	int rc = 0;
+	struct scatterlist sg;
+
+	mutex_lock(&crypt_stat->cs_md5_tfm_mutex);
+	sg_init_one(&sg, (u8 *)src, len);
+	if (!crypt_stat->md5_tfm) {
+		crypt_stat->md5_tfm =
+			crypto_alloc_tfm("md5", CRYPTO_TFM_REQ_MAY_SLEEP);
+		if (!crypt_stat->md5_tfm) {
+			rc = -ENOMEM;
+			ecryptfs_printk(KERN_ERR, "Error attempting to "
+					"allocate crypto context\n");
+			goto out;
+		}
+	}
+	crypto_digest_init(crypt_stat->md5_tfm);
+	crypto_digest_update(crypt_stat->md5_tfm, &sg, 1);
+	crypto_digest_final(crypt_stat->md5_tfm, dst);
+	mutex_unlock(&crypt_stat->cs_md5_tfm_mutex);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_derive_iv
+ * @iv: destination for the derived iv vale
+ * @crypt_stat: Pointer to crypt_stat struct for the current inode
+ * @offset: Offset of the page whose's iv we are to derive
+ *
+ * Generate the initialization vector from the given root IV and page
+ * offset.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+			      pgoff_t offset)
+{
+	int rc = 0;
+	char dst[MD5_DIGEST_SIZE];
+	char src[ECRYPTFS_MAX_IV_BYTES + 16];
+
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "root iv:\n");
+		ecryptfs_dump_hex(crypt_stat->root_iv, crypt_stat->iv_bytes);
+	}
+	/* TODO: It is probably secure to just cast the least
+	 * significant bits of the root IV into an unsigned long and
+	 * add the offset to that rather than go through all this
+	 * hashing business. -Halcrow */
+	memcpy(src, crypt_stat->root_iv, crypt_stat->iv_bytes);
+	memset((src + crypt_stat->iv_bytes), 0, 16);
+	snprintf((src + crypt_stat->iv_bytes), 16, "%ld", offset);
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "source:\n");
+		ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16));
+	}
+	rc = ecryptfs_calculate_md5(dst, crypt_stat, src,
+				    (crypt_stat->iv_bytes + 16));
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
+				"MD5 while generating IV for a page\n");
+		goto out;
+	}
+	memcpy(iv, dst, crypt_stat->iv_bytes);
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "derived iv:\n");
+		ecryptfs_dump_hex(iv, crypt_stat->iv_bytes);
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_init_crypt_stat
+ * @crypt_stat: Pointer to the crypt_stat struct to initialize.
+ *
+ * Initialize the crypt_stat structure.
+ */
+void
+ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
+	mutex_init(&crypt_stat->cs_mutex);
+	mutex_init(&crypt_stat->cs_tfm_mutex);
+	mutex_init(&crypt_stat->cs_md5_tfm_mutex);
+	ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_STRUCT_INITIALIZED);
+}
+
+/**
+ * ecryptfs_destruct_crypt_stat
+ * @crypt_stat: Pointer to the crypt_stat struct to initialize.
+ *
+ * Releases all memory associated with a crypt_stat struct.
+ */
+void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	if (crypt_stat->tfm)
+		crypto_free_tfm(crypt_stat->tfm);
+	if (crypt_stat->md5_tfm)
+		crypto_free_tfm(crypt_stat->md5_tfm);
+	memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
+}
+
+void ecryptfs_destruct_mount_crypt_stat(
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	if (mount_crypt_stat->global_auth_tok_key)
+		key_put(mount_crypt_stat->global_auth_tok_key);
+	if (mount_crypt_stat->global_key_tfm)
+		crypto_free_tfm(mount_crypt_stat->global_key_tfm);
+	memset(mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat));
+}
+
+/**
+ * virt_to_scatterlist
+ * @addr: Virtual address
+ * @size: Size of data; should be an even multiple of the block size
+ * @sg: Pointer to scatterlist array; set to NULL to obtain only
+ *      the number of scatterlist structs required in array
+ * @sg_size: Max array size
+ *
+ * Fills in a scatterlist array with page references for a passed
+ * virtual address.
+ *
+ * Returns the number of scatterlist structs in array used
+ */
+int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
+			int sg_size)
+{
+	int i = 0;
+	struct page *pg;
+	int offset;
+	int remainder_of_page;
+
+	while (size > 0 && i < sg_size) {
+		pg = virt_to_page(addr);
+		offset = offset_in_page(addr);
+		if (sg) {
+			sg[i].page = pg;
+			sg[i].offset = offset;
+		}
+		remainder_of_page = PAGE_CACHE_SIZE - offset;
+		if (size >= remainder_of_page) {
+			if (sg)
+				sg[i].length = remainder_of_page;
+			addr += remainder_of_page;
+			size -= remainder_of_page;
+		} else {
+			if (sg)
+				sg[i].length = size;
+			addr += size;
+			size = 0;
+		}
+		i++;
+	}
+	if (size > 0)
+		return -ENOMEM;
+	return i;
+}
+
+/**
+ * encrypt_scatterlist
+ * @crypt_stat: Pointer to the crypt_stat struct to initialize.
+ * @dest_sg: Destination of encrypted data
+ * @src_sg: Data to be encrypted
+ * @size: Length of data to be encrypted
+ * @iv: iv to use during encryption
+ *
+ * Returns the number of bytes encrypted; negative value on error
+ */
+static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
+			       struct scatterlist *dest_sg,
+			       struct scatterlist *src_sg, int size,
+			       unsigned char *iv)
+{
+	int rc = 0;
+
+	BUG_ON(!crypt_stat || !crypt_stat->tfm
+	       || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+				       ECRYPTFS_STRUCT_INITIALIZED));
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n",
+				crypt_stat->key_size);
+		ecryptfs_dump_hex(crypt_stat->key,
+				  crypt_stat->key_size);
+	}
+	/* Consider doing this once, when the file is opened */
+	mutex_lock(&crypt_stat->cs_tfm_mutex);
+	rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key,
+				  crypt_stat->key_size);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
+				rc);
+		mutex_unlock(&crypt_stat->cs_tfm_mutex);
+		rc = -EINVAL;
+		goto out;
+	}
+	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size);
+	crypto_cipher_encrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size, iv);
+	mutex_unlock(&crypt_stat->cs_tfm_mutex);
+out:
+	return rc;
+}
+
+static void
+ecryptfs_extent_to_lwr_pg_idx_and_offset(unsigned long *lower_page_idx,
+					 int *byte_offset,
+					 struct ecryptfs_crypt_stat *crypt_stat,
+					 unsigned long extent_num)
+{
+	unsigned long lower_extent_num;
+	int extents_occupied_by_headers_at_front;
+	int bytes_occupied_by_headers_at_front;
+	int extent_offset;
+	int extents_per_page;
+
+	bytes_occupied_by_headers_at_front =
+		( crypt_stat->header_extent_size
+		  * crypt_stat->num_header_extents_at_front );
+	extents_occupied_by_headers_at_front =
+		( bytes_occupied_by_headers_at_front
+		  / crypt_stat->extent_size );
+	lower_extent_num = extents_occupied_by_headers_at_front + extent_num;
+	extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
+	(*lower_page_idx) = lower_extent_num / extents_per_page;
+	extent_offset = lower_extent_num % extents_per_page;
+	(*byte_offset) = extent_offset * crypt_stat->extent_size;
+	ecryptfs_printk(KERN_DEBUG, " * crypt_stat->header_extent_size = "
+			"[%d]\n", crypt_stat->header_extent_size);
+	ecryptfs_printk(KERN_DEBUG, " * crypt_stat->"
+			"num_header_extents_at_front = [%d]\n",
+			crypt_stat->num_header_extents_at_front);
+	ecryptfs_printk(KERN_DEBUG, " * extents_occupied_by_headers_at_"
+			"front = [%d]\n", extents_occupied_by_headers_at_front);
+	ecryptfs_printk(KERN_DEBUG, " * lower_extent_num = [0x%.16x]\n",
+			lower_extent_num);
+	ecryptfs_printk(KERN_DEBUG, " * extents_per_page = [%d]\n",
+			extents_per_page);
+	ecryptfs_printk(KERN_DEBUG, " * (*lower_page_idx) = [0x%.16x]\n",
+			(*lower_page_idx));
+	ecryptfs_printk(KERN_DEBUG, " * extent_offset = [%d]\n",
+			extent_offset);
+	ecryptfs_printk(KERN_DEBUG, " * (*byte_offset) = [%d]\n",
+			(*byte_offset));
+}
+
+static int ecryptfs_write_out_page(struct ecryptfs_page_crypt_context *ctx,
+				   struct page *lower_page,
+				   struct inode *lower_inode,
+				   int byte_offset_in_page, int bytes_to_write)
+{
+	int rc = 0;
+
+	if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) {
+		rc = ecryptfs_commit_lower_page(lower_page, lower_inode,
+						ctx->param.lower_file,
+						byte_offset_in_page,
+						bytes_to_write);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error calling lower "
+					"commit; rc = [%d]\n", rc);
+			goto out;
+		}
+	} else {
+		rc = ecryptfs_writepage_and_release_lower_page(lower_page,
+							       lower_inode,
+							       ctx->param.wbc);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error calling lower "
+					"writepage(); rc = [%d]\n", rc);
+			goto out;
+		}
+	}
+out:
+	return rc;
+}
+
+static int ecryptfs_read_in_page(struct ecryptfs_page_crypt_context *ctx,
+				 struct page **lower_page,
+				 struct inode *lower_inode,
+				 unsigned long lower_page_idx,
+				 int byte_offset_in_page)
+{
+	int rc = 0;
+
+	if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) {
+		/* TODO: Limit this to only the data extents that are
+		 * needed */
+		rc = ecryptfs_get_lower_page(lower_page, lower_inode,
+					     ctx->param.lower_file,
+					     lower_page_idx,
+					     byte_offset_in_page,
+					     (PAGE_CACHE_SIZE
+					      - byte_offset_in_page));
+		if (rc) {
+			ecryptfs_printk(
+				KERN_ERR, "Error attempting to grab, map, "
+				"and prepare_write lower page with index "
+				"[0x%.16x]; rc = [%d]\n", lower_page_idx, rc);
+			goto out;
+		}
+	} else {
+		rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL,
+						      lower_inode,
+						      lower_page_idx);
+		if (rc) {
+			ecryptfs_printk(
+				KERN_ERR, "Error attempting to grab and map "
+				"lower page with index [0x%.16x]; rc = [%d]\n",
+				lower_page_idx, rc);
+			goto out;
+		}
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_encrypt_page
+ * @ctx: The context of the page
+ *
+ * Encrypt an eCryptfs page. This is done on a per-extent basis. Note
+ * that eCryptfs pages may straddle the lower pages -- for instance,
+ * if the file was created on a machine with an 8K page size
+ * (resulting in an 8K header), and then the file is copied onto a
+ * host with a 32K page size, then when reading page 0 of the eCryptfs
+ * file, 24K of page 0 of the lower file will be read and decrypted,
+ * and then 8K of page 1 of the lower file will be read and decrypted.
+ *
+ * The actual operations performed on each page depends on the
+ * contents of the ecryptfs_page_crypt_context struct.
+ *
+ * Returns zero on success; negative on error
+ */
+int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx)
+{
+	char extent_iv[ECRYPTFS_MAX_IV_BYTES];
+	unsigned long base_extent;
+	unsigned long extent_offset = 0;
+	unsigned long lower_page_idx = 0;
+	unsigned long prior_lower_page_idx = 0;
+	struct page *lower_page;
+	struct inode *lower_inode;
+	struct ecryptfs_inode_info *inode_info;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	int rc = 0;
+	int lower_byte_offset = 0;
+	int orig_byte_offset = 0;
+	int num_extents_per_page;
+#define ECRYPTFS_PAGE_STATE_UNREAD    0
+#define ECRYPTFS_PAGE_STATE_READ      1
+#define ECRYPTFS_PAGE_STATE_MODIFIED  2
+#define ECRYPTFS_PAGE_STATE_WRITTEN   3
+	int page_state;
+
+	lower_inode = ecryptfs_inode_to_lower(ctx->page->mapping->host);
+	inode_info = ecryptfs_inode_to_private(ctx->page->mapping->host);
+	crypt_stat = &inode_info->crypt_stat;
+	if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) {
+		rc = ecryptfs_copy_page_to_lower(ctx->page, lower_inode,
+						 ctx->param.lower_file);
+		if (rc)
+			ecryptfs_printk(KERN_ERR, "Error attempting to copy "
+					"page at index [0x%.16x]\n",
+					ctx->page->index);
+		goto out;
+	}
+	num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
+	base_extent = (ctx->page->index * num_extents_per_page);
+	page_state = ECRYPTFS_PAGE_STATE_UNREAD;
+	while (extent_offset < num_extents_per_page) {
+		ecryptfs_extent_to_lwr_pg_idx_and_offset(
+			&lower_page_idx, &lower_byte_offset, crypt_stat,
+			(base_extent + extent_offset));
+		if (prior_lower_page_idx != lower_page_idx
+		    && page_state == ECRYPTFS_PAGE_STATE_MODIFIED) {
+			rc = ecryptfs_write_out_page(ctx, lower_page,
+						     lower_inode,
+						     orig_byte_offset,
+						     (PAGE_CACHE_SIZE
+						      - orig_byte_offset));
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "Error attempting "
+						"to write out page; rc = [%d]"
+						"\n", rc);
+				goto out;
+			}
+			page_state = ECRYPTFS_PAGE_STATE_WRITTEN;
+		}
+		if (page_state == ECRYPTFS_PAGE_STATE_UNREAD
+		    || page_state == ECRYPTFS_PAGE_STATE_WRITTEN) {
+			rc = ecryptfs_read_in_page(ctx, &lower_page,
+						   lower_inode, lower_page_idx,
+						   lower_byte_offset);
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "Error attempting "
+						"to read in lower page with "
+						"index [0x%.16x]; rc = [%d]\n",
+						lower_page_idx, rc);
+				goto out;
+			}
+			orig_byte_offset = lower_byte_offset;
+			prior_lower_page_idx = lower_page_idx;
+			page_state = ECRYPTFS_PAGE_STATE_READ;
+		}
+		BUG_ON(!(page_state == ECRYPTFS_PAGE_STATE_MODIFIED
+			 || page_state == ECRYPTFS_PAGE_STATE_READ));
+		rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
+					(base_extent + extent_offset));
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error attempting to "
+					"derive IV for extent [0x%.16x]; "
+					"rc = [%d]\n",
+					(base_extent + extent_offset), rc);
+			goto out;
+		}
+		if (unlikely(ecryptfs_verbosity > 0)) {
+			ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
+					"with iv:\n");
+			ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
+			ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
+					"encryption:\n");
+			ecryptfs_dump_hex((char *)
+					  (page_address(ctx->page)
+					   + (extent_offset
+					      * crypt_stat->extent_size)), 8);
+		}
+		rc = ecryptfs_encrypt_page_offset(
+			crypt_stat, lower_page, lower_byte_offset, ctx->page,
+			(extent_offset * crypt_stat->extent_size),
+			crypt_stat->extent_size, extent_iv);
+		ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; "
+				"rc = [%d]\n",
+				(base_extent + extent_offset), rc);
+		if (unlikely(ecryptfs_verbosity > 0)) {
+			ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
+					"encryption:\n");
+			ecryptfs_dump_hex((char *)(page_address(lower_page)
+						   + lower_byte_offset), 8);
+		}
+		page_state = ECRYPTFS_PAGE_STATE_MODIFIED;
+		extent_offset++;
+	}
+	BUG_ON(orig_byte_offset != 0);
+	rc = ecryptfs_write_out_page(ctx, lower_page, lower_inode, 0,
+				     (lower_byte_offset
+				      + crypt_stat->extent_size));
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error attempting to write out "
+				"page; rc = [%d]\n", rc);
+				goto out;
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_decrypt_page
+ * @file: The ecryptfs file
+ * @page: The page in ecryptfs to decrypt
+ *
+ * Decrypt an eCryptfs page. This is done on a per-extent basis. Note
+ * that eCryptfs pages may straddle the lower pages -- for instance,
+ * if the file was created on a machine with an 8K page size
+ * (resulting in an 8K header), and then the file is copied onto a
+ * host with a 32K page size, then when reading page 0 of the eCryptfs
+ * file, 24K of page 0 of the lower file will be read and decrypted,
+ * and then 8K of page 1 of the lower file will be read and decrypted.
+ *
+ * Returns zero on success; negative on error
+ */
+int ecryptfs_decrypt_page(struct file *file, struct page *page)
+{
+	char extent_iv[ECRYPTFS_MAX_IV_BYTES];
+	unsigned long base_extent;
+	unsigned long extent_offset = 0;
+	unsigned long lower_page_idx = 0;
+	unsigned long prior_lower_page_idx = 0;
+	struct page *lower_page;
+	char *lower_page_virt = NULL;
+	struct inode *lower_inode;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	int rc = 0;
+	int byte_offset;
+	int num_extents_per_page;
+	int page_state;
+
+	crypt_stat = &(ecryptfs_inode_to_private(
+			       page->mapping->host)->crypt_stat);
+	lower_inode = ecryptfs_inode_to_lower(page->mapping->host);
+	if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) {
+		rc = ecryptfs_do_readpage(file, page, page->index);
+		if (rc)
+			ecryptfs_printk(KERN_ERR, "Error attempting to copy "
+					"page at index [0x%.16x]\n",
+					page->index);
+		goto out;
+	}
+	num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size;
+	base_extent = (page->index * num_extents_per_page);
+	lower_page_virt = kmem_cache_alloc(ecryptfs_lower_page_cache,
+					   SLAB_KERNEL);
+	if (!lower_page_virt) {
+		rc = -ENOMEM;
+		ecryptfs_printk(KERN_ERR, "Error getting page for encrypted "
+				"lower page(s)\n");
+		goto out;
+	}
+	lower_page = virt_to_page(lower_page_virt);
+	page_state = ECRYPTFS_PAGE_STATE_UNREAD;
+	while (extent_offset < num_extents_per_page) {
+		ecryptfs_extent_to_lwr_pg_idx_and_offset(
+			&lower_page_idx, &byte_offset, crypt_stat,
+			(base_extent + extent_offset));
+		if (prior_lower_page_idx != lower_page_idx
+		    || page_state == ECRYPTFS_PAGE_STATE_UNREAD) {
+			rc = ecryptfs_do_readpage(file, lower_page,
+						  lower_page_idx);
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "Error reading "
+						"lower encrypted page; rc = "
+						"[%d]\n", rc);
+				goto out;
+			}
+			prior_lower_page_idx = lower_page_idx;
+			page_state = ECRYPTFS_PAGE_STATE_READ;
+		}
+		rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
+					(base_extent + extent_offset));
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error attempting to "
+					"derive IV for extent [0x%.16x]; rc = "
+					"[%d]\n",
+					(base_extent + extent_offset), rc);
+			goto out;
+		}
+		if (unlikely(ecryptfs_verbosity > 0)) {
+			ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
+					"with iv:\n");
+			ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
+			ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
+					"decryption:\n");
+			ecryptfs_dump_hex((lower_page_virt + byte_offset), 8);
+		}
+		rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
+						  (extent_offset
+						   * crypt_stat->extent_size),
+						  lower_page, byte_offset,
+						  crypt_stat->extent_size,
+						  extent_iv);
+		if (rc != crypt_stat->extent_size) {
+			ecryptfs_printk(KERN_ERR, "Error attempting to "
+					"decrypt extent [0x%.16x]\n",
+					(base_extent + extent_offset));
+			goto out;
+		}
+		rc = 0;
+		if (unlikely(ecryptfs_verbosity > 0)) {
+			ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
+					"decryption:\n");
+			ecryptfs_dump_hex((char *)(page_address(page)
+						   + byte_offset), 8);
+		}
+		extent_offset++;
+	}
+out:
+	if (lower_page_virt)
+		kmem_cache_free(ecryptfs_lower_page_cache, lower_page_virt);
+	return rc;
+}
+
+/**
+ * decrypt_scatterlist
+ *
+ * Returns the number of bytes decrypted; negative value on error
+ */
+static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
+			       struct scatterlist *dest_sg,
+			       struct scatterlist *src_sg, int size,
+			       unsigned char *iv)
+{
+	int rc = 0;
+
+	/* Consider doing this once, when the file is opened */
+	mutex_lock(&crypt_stat->cs_tfm_mutex);
+	rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key,
+				  crypt_stat->key_size);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n",
+				rc);
+		mutex_unlock(&crypt_stat->cs_tfm_mutex);
+		rc = -EINVAL;
+		goto out;
+	}
+	ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size);
+	rc = crypto_cipher_decrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size,
+				      iv);
+	mutex_unlock(&crypt_stat->cs_tfm_mutex);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n",
+				rc);
+		goto out;
+	}
+	rc = size;
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_encrypt_page_offset
+ *
+ * Returns the number of bytes encrypted
+ */
+static int
+ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
+			     struct page *dst_page, int dst_offset,
+			     struct page *src_page, int src_offset, int size,
+			     unsigned char *iv)
+{
+	struct scatterlist src_sg, dst_sg;
+
+	src_sg.page = src_page;
+	src_sg.offset = src_offset;
+	src_sg.length = size;
+	dst_sg.page = dst_page;
+	dst_sg.offset = dst_offset;
+	dst_sg.length = size;
+	return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
+}
+
+/**
+ * ecryptfs_decrypt_page_offset
+ *
+ * Returns the number of bytes decrypted
+ */
+static int
+ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat,
+			     struct page *dst_page, int dst_offset,
+			     struct page *src_page, int src_offset, int size,
+			     unsigned char *iv)
+{
+	struct scatterlist src_sg, dst_sg;
+
+	src_sg.page = src_page;
+	src_sg.offset = src_offset;
+	src_sg.length = size;
+	dst_sg.page = dst_page;
+	dst_sg.offset = dst_offset;
+	dst_sg.length = size;
+	return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv);
+}
+
+#define ECRYPTFS_MAX_SCATTERLIST_LEN 4
+
+/**
+ * ecryptfs_init_crypt_ctx
+ * @crypt_stat: Uninitilized crypt stats structure
+ *
+ * Initialize the crypto context.
+ *
+ * TODO: Performance: Keep a cache of initialized cipher contexts;
+ * only init if needed
+ */
+int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	int rc = -EINVAL;
+
+	if (!crypt_stat->cipher) {
+		ecryptfs_printk(KERN_ERR, "No cipher specified\n");
+		goto out;
+	}
+	ecryptfs_printk(KERN_DEBUG,
+			"Initializing cipher [%s]; strlen = [%d]; "
+			"key_size_bits = [%d]\n",
+			crypt_stat->cipher, (int)strlen(crypt_stat->cipher),
+			crypt_stat->key_size << 3);
+	if (crypt_stat->tfm) {
+		rc = 0;
+		goto out;
+	}
+	mutex_lock(&crypt_stat->cs_tfm_mutex);
+	crypt_stat->tfm = crypto_alloc_tfm(crypt_stat->cipher,
+					   ECRYPTFS_DEFAULT_CHAINING_MODE
+					   | CRYPTO_TFM_REQ_WEAK_KEY);
+	mutex_unlock(&crypt_stat->cs_tfm_mutex);
+	if (!crypt_stat->tfm) {
+		ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
+				"Error initializing cipher [%s]\n",
+				crypt_stat->cipher);
+		goto out;
+	}
+	rc = 0;
+out:
+	return rc;
+}
+
+static void set_extent_mask_and_shift(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	int extent_size_tmp;
+
+	crypt_stat->extent_mask = 0xFFFFFFFF;
+	crypt_stat->extent_shift = 0;
+	if (crypt_stat->extent_size == 0)
+		return;
+	extent_size_tmp = crypt_stat->extent_size;
+	while ((extent_size_tmp & 0x01) == 0) {
+		extent_size_tmp >>= 1;
+		crypt_stat->extent_mask <<= 1;
+		crypt_stat->extent_shift++;
+	}
+}
+
+void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	/* Default values; may be overwritten as we are parsing the
+	 * packets. */
+	crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
+	set_extent_mask_and_shift(crypt_stat);
+	crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
+	if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
+		crypt_stat->header_extent_size =
+			ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
+	} else
+		crypt_stat->header_extent_size = PAGE_CACHE_SIZE;
+	crypt_stat->num_header_extents_at_front = 1;
+}
+
+/**
+ * ecryptfs_compute_root_iv
+ * @crypt_stats
+ *
+ * On error, sets the root IV to all 0's.
+ */
+int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	int rc = 0;
+	char dst[MD5_DIGEST_SIZE];
+
+	BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE);
+	BUG_ON(crypt_stat->iv_bytes <= 0);
+	if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID)) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_WARNING, "Session key not valid; "
+				"cannot generate root IV\n");
+		goto out;
+	}
+	rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key,
+				    crypt_stat->key_size);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error attempting to compute "
+				"MD5 while generating root IV\n");
+		goto out;
+	}
+	memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes);
+out:
+	if (rc) {
+		memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes);
+		ECRYPTFS_SET_FLAG(crypt_stat->flags,
+				  ECRYPTFS_SECURITY_WARNING);
+	}
+	return rc;
+}
+
+static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	get_random_bytes(crypt_stat->key, crypt_stat->key_size);
+	ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
+	ecryptfs_compute_root_iv(crypt_stat);
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n");
+		ecryptfs_dump_hex(crypt_stat->key,
+				  crypt_stat->key_size);
+	}
+}
+
+/**
+ * ecryptfs_set_default_crypt_stat_vals
+ * @crypt_stat
+ *
+ * Default values in the event that policy does not override them.
+ */
+static void ecryptfs_set_default_crypt_stat_vals(
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	ecryptfs_set_default_sizes(crypt_stat);
+	strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER);
+	crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES;
+	ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
+	crypt_stat->file_version = ECRYPTFS_FILE_VERSION;
+	crypt_stat->mount_crypt_stat = mount_crypt_stat;
+}
+
+/**
+ * ecryptfs_new_file_context
+ * @ecryptfs_dentry
+ *
+ * If the crypto context for the file has not yet been established,
+ * this is where we do that.  Establishing a new crypto context
+ * involves the following decisions:
+ *  - What cipher to use?
+ *  - What set of authentication tokens to use?
+ * Here we just worry about getting enough information into the
+ * authentication tokens so that we know that they are available.
+ * We associate the available authentication tokens with the new file
+ * via the set of signatures in the crypt_stat struct.  Later, when
+ * the headers are actually written out, we may again defer to
+ * userspace to perform the encryption of the session key; for the
+ * foreseeable future, this will be the case with public key packets.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+/* Associate an authentication token(s) with the file */
+int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry)
+{
+	int rc = 0;
+	struct ecryptfs_crypt_stat *crypt_stat =
+	    &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+	    &ecryptfs_superblock_to_private(
+		    ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	int cipher_name_len;
+
+	ecryptfs_set_default_crypt_stat_vals(crypt_stat, mount_crypt_stat);
+	/* See if there are mount crypt options */
+	if (mount_crypt_stat->global_auth_tok) {
+		ecryptfs_printk(KERN_DEBUG, "Initializing context for new "
+				"file using mount_crypt_stat\n");
+		ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
+		ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
+		memcpy(crypt_stat->keysigs[crypt_stat->num_keysigs++],
+		       mount_crypt_stat->global_auth_tok_sig,
+		       ECRYPTFS_SIG_SIZE_HEX);
+		cipher_name_len =
+		    strlen(mount_crypt_stat->global_default_cipher_name);
+		memcpy(crypt_stat->cipher,
+		       mount_crypt_stat->global_default_cipher_name,
+		       cipher_name_len);
+		crypt_stat->cipher[cipher_name_len] = '\0';
+		crypt_stat->key_size =
+			mount_crypt_stat->global_default_cipher_key_size;
+		ecryptfs_generate_new_key(crypt_stat);
+	} else
+		/* We should not encounter this scenario since we
+		 * should detect lack of global_auth_tok at mount time
+		 * TODO: Applies to 0.1 release only; remove in future
+		 * release */
+		BUG();
+	rc = ecryptfs_init_crypt_ctx(crypt_stat);
+	if (rc)
+		ecryptfs_printk(KERN_ERR, "Error initializing cryptographic "
+				"context for cipher [%s]: rc = [%d]\n",
+				crypt_stat->cipher, rc);
+	return rc;
+}
+
+/**
+ * contains_ecryptfs_marker - check for the ecryptfs marker
+ * @data: The data block in which to check
+ *
+ * Returns one if marker found; zero if not found
+ */
+int contains_ecryptfs_marker(char *data)
+{
+	u32 m_1, m_2;
+
+	memcpy(&m_1, data, 4);
+	m_1 = be32_to_cpu(m_1);
+	memcpy(&m_2, (data + 4), 4);
+	m_2 = be32_to_cpu(m_2);
+	if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
+		return 1;
+	ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
+			"MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
+			MAGIC_ECRYPTFS_MARKER);
+	ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
+			"[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
+	return 0;
+}
+
+struct ecryptfs_flag_map_elem {
+	u32 file_flag;
+	u32 local_flag;
+};
+
+/* Add support for additional flags by adding elements here. */
+static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
+	{0x00000001, ECRYPTFS_ENABLE_HMAC},
+	{0x00000002, ECRYPTFS_ENCRYPTED}
+};
+
+/**
+ * ecryptfs_process_flags
+ * @crypt_stat
+ * @page_virt: Source data to be parsed
+ * @bytes_read: Updated with the number of bytes read
+ *
+ * Returns zero on success; non-zero if the flag set is invalid
+ */
+static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
+				  char *page_virt, int *bytes_read)
+{
+	int rc = 0;
+	int i;
+	u32 flags;
+
+	memcpy(&flags, page_virt, 4);
+	flags = be32_to_cpu(flags);
+	for (i = 0; i < ((sizeof(ecryptfs_flag_map)
+			  / sizeof(struct ecryptfs_flag_map_elem))); i++)
+		if (flags & ecryptfs_flag_map[i].file_flag) {
+			ECRYPTFS_SET_FLAG(crypt_stat->flags,
+					  ecryptfs_flag_map[i].local_flag);
+		} else
+			ECRYPTFS_CLEAR_FLAG(crypt_stat->flags,
+					    ecryptfs_flag_map[i].local_flag);
+	/* Version is in top 8 bits of the 32-bit flag vector */
+	crypt_stat->file_version = ((flags >> 24) & 0xFF);
+	(*bytes_read) = 4;
+	return rc;
+}
+
+/**
+ * write_ecryptfs_marker
+ * @page_virt: The pointer to in a page to begin writing the marker
+ * @written: Number of bytes written
+ *
+ * Marker = 0x3c81b7f5
+ */
+static void write_ecryptfs_marker(char *page_virt, size_t *written)
+{
+	u32 m_1, m_2;
+
+	get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
+	m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER);
+	m_1 = cpu_to_be32(m_1);
+	memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
+	m_2 = cpu_to_be32(m_2);
+	memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2,
+	       (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
+	(*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
+}
+
+static void
+write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
+		     size_t *written)
+{
+	u32 flags = 0;
+	int i;
+
+	for (i = 0; i < ((sizeof(ecryptfs_flag_map)
+			  / sizeof(struct ecryptfs_flag_map_elem))); i++)
+		if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+					ecryptfs_flag_map[i].local_flag))
+			flags |= ecryptfs_flag_map[i].file_flag;
+	/* Version is in top 8 bits of the 32-bit flag vector */
+	flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000);
+	flags = cpu_to_be32(flags);
+	memcpy(page_virt, &flags, 4);
+	(*written) = 4;
+}
+
+struct ecryptfs_cipher_code_str_map_elem {
+	char cipher_str[16];
+	u16 cipher_code;
+};
+
+/* Add support for additional ciphers by adding elements here. The
+ * cipher_code is whatever OpenPGP applicatoins use to identify the
+ * ciphers. List in order of probability. */
+static struct ecryptfs_cipher_code_str_map_elem
+ecryptfs_cipher_code_str_map[] = {
+	{"aes",RFC2440_CIPHER_AES_128 },
+	{"blowfish", RFC2440_CIPHER_BLOWFISH},
+	{"des3_ede", RFC2440_CIPHER_DES3_EDE},
+	{"cast5", RFC2440_CIPHER_CAST_5},
+	{"twofish", RFC2440_CIPHER_TWOFISH},
+	{"cast6", RFC2440_CIPHER_CAST_6},
+	{"aes", RFC2440_CIPHER_AES_192},
+	{"aes", RFC2440_CIPHER_AES_256}
+};
+
+/**
+ * ecryptfs_code_for_cipher_string
+ * @str: The string representing the cipher name
+ *
+ * Returns zero on no match, or the cipher code on match
+ */
+u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	int i;
+	u16 code = 0;
+	struct ecryptfs_cipher_code_str_map_elem *map =
+		ecryptfs_cipher_code_str_map;
+
+	if (strcmp(crypt_stat->cipher, "aes") == 0) {
+		switch (crypt_stat->key_size) {
+		case 16:
+			code = RFC2440_CIPHER_AES_128;
+			break;
+		case 24:
+			code = RFC2440_CIPHER_AES_192;
+			break;
+		case 32:
+			code = RFC2440_CIPHER_AES_256;
+		}
+	} else {
+		for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
+			if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){
+				code = map[i].cipher_code;
+				break;
+			}
+	}
+	return code;
+}
+
+/**
+ * ecryptfs_cipher_code_to_string
+ * @str: Destination to write out the cipher name
+ * @cipher_code: The code to convert to cipher name string
+ *
+ * Returns zero on success
+ */
+int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code)
+{
+	int rc = 0;
+	int i;
+
+	str[0] = '\0';
+	for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
+		if (cipher_code == ecryptfs_cipher_code_str_map[i].cipher_code)
+			strcpy(str, ecryptfs_cipher_code_str_map[i].cipher_str);
+	if (str[0] == '\0') {
+		ecryptfs_printk(KERN_WARNING, "Cipher code not recognized: "
+				"[%d]\n", cipher_code);
+		rc = -EINVAL;
+	}
+	return rc;
+}
+
+/**
+ * ecryptfs_read_header_region
+ * @data
+ * @dentry
+ * @nd
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_read_header_region(char *data, struct dentry *dentry,
+				struct vfsmount *mnt)
+{
+	struct file *file;
+	mm_segment_t oldfs;
+	int rc;
+
+	mnt = mntget(mnt);
+	file = dentry_open(dentry, mnt, O_RDONLY);
+	if (IS_ERR(file)) {
+		ecryptfs_printk(KERN_DEBUG, "Error opening file to "
+				"read header region\n");
+		mntput(mnt);
+		rc = PTR_ERR(file);
+		goto out;
+	}
+	file->f_pos = 0;
+	oldfs = get_fs();
+	set_fs(get_ds());
+	/* For releases 0.1 and 0.2, all of the header information
+	 * fits in the first data extent-sized region. */
+	rc = file->f_op->read(file, (char __user *)data,
+			      ECRYPTFS_DEFAULT_EXTENT_SIZE, &file->f_pos);
+	set_fs(oldfs);
+	fput(file);
+	rc = 0;
+out:
+	return rc;
+}
+
+static void
+write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat,
+		      size_t *written)
+{
+	u32 header_extent_size;
+	u16 num_header_extents_at_front;
+
+	header_extent_size = (u32)crypt_stat->header_extent_size;
+	num_header_extents_at_front =
+		(u16)crypt_stat->num_header_extents_at_front;
+	header_extent_size = cpu_to_be32(header_extent_size);
+	memcpy(virt, &header_extent_size, 4);
+	virt += 4;
+	num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front);
+	memcpy(virt, &num_header_extents_at_front, 2);
+	(*written) = 6;
+}
+
+struct kmem_cache *ecryptfs_header_cache_0;
+struct kmem_cache *ecryptfs_header_cache_1;
+struct kmem_cache *ecryptfs_header_cache_2;
+
+/**
+ * ecryptfs_write_headers_virt
+ * @page_virt
+ * @crypt_stat
+ * @ecryptfs_dentry
+ *
+ * Format version: 1
+ *
+ *   Header Extent:
+ *     Octets 0-7:        Unencrypted file size (big-endian)
+ *     Octets 8-15:       eCryptfs special marker
+ *     Octets 16-19:      Flags
+ *      Octet 16:         File format version number (between 0 and 255)
+ *      Octets 17-18:     Reserved
+ *      Octet 19:         Bit 1 (lsb): Reserved
+ *                        Bit 2: Encrypted?
+ *                        Bits 3-8: Reserved
+ *     Octets 20-23:      Header extent size (big-endian)
+ *     Octets 24-25:      Number of header extents at front of file
+ *                        (big-endian)
+ *     Octet  26:         Begin RFC 2440 authentication token packet set
+ *   Data Extent 0:
+ *     Lower data (CBC encrypted)
+ *   Data Extent 1:
+ *     Lower data (CBC encrypted)
+ *   ...
+ *
+ * Returns zero on success
+ */
+int ecryptfs_write_headers_virt(char *page_virt,
+				struct ecryptfs_crypt_stat *crypt_stat,
+				struct dentry *ecryptfs_dentry)
+{
+	int rc;
+	size_t written;
+	size_t offset;
+
+	offset = ECRYPTFS_FILE_SIZE_BYTES;
+	write_ecryptfs_marker((page_virt + offset), &written);
+	offset += written;
+	write_ecryptfs_flags((page_virt + offset), crypt_stat, &written);
+	offset += written;
+	write_header_metadata((page_virt + offset), crypt_stat, &written);
+	offset += written;
+	rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat,
+					      ecryptfs_dentry, &written,
+					      PAGE_CACHE_SIZE - offset);
+	if (rc)
+		ecryptfs_printk(KERN_WARNING, "Error generating key packet "
+				"set; rc = [%d]\n", rc);
+	return rc;
+}
+
+/**
+ * ecryptfs_write_headers
+ * @lower_file: The lower file struct, which was returned from dentry_open
+ *
+ * Write the file headers out.  This will likely involve a userspace
+ * callout, in which the session key is encrypted with one or more
+ * public keys and/or the passphrase necessary to do the encryption is
+ * retrieved via a prompt.  Exactly what happens at this point should
+ * be policy-dependent.
+ *
+ * Returns zero on success; non-zero on error
+ */
+int ecryptfs_write_headers(struct dentry *ecryptfs_dentry,
+			   struct file *lower_file)
+{
+	mm_segment_t oldfs;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	char *page_virt;
+	int current_header_page;
+	int header_pages;
+	int rc = 0;
+
+	crypt_stat = &ecryptfs_inode_to_private(
+		ecryptfs_dentry->d_inode)->crypt_stat;
+	if (likely(ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+				       ECRYPTFS_ENCRYPTED))) {
+		if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+					 ECRYPTFS_KEY_VALID)) {
+			ecryptfs_printk(KERN_DEBUG, "Key is "
+					"invalid; bailing out\n");
+			rc = -EINVAL;
+			goto out;
+		}
+	} else {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_WARNING,
+				"Called with crypt_stat->encrypted == 0\n");
+		goto out;
+	}
+	/* Released in this function */
+	page_virt = kmem_cache_alloc(ecryptfs_header_cache_0, SLAB_USER);
+	if (!page_virt) {
+		ecryptfs_printk(KERN_ERR, "Out of memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	memset(page_virt, 0, PAGE_CACHE_SIZE);
+	rc = ecryptfs_write_headers_virt(page_virt, crypt_stat,
+					 ecryptfs_dentry);
+	if (unlikely(rc)) {
+		ecryptfs_printk(KERN_ERR, "Error whilst writing headers\n");
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
+		goto out_free;
+	}
+	ecryptfs_printk(KERN_DEBUG,
+			"Writing key packet set to underlying file\n");
+	lower_file->f_pos = 0;
+	oldfs = get_fs();
+	set_fs(get_ds());
+	ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->"
+			"write() w/ header page; lower_file->f_pos = "
+			"[0x%.16x]\n", lower_file->f_pos);
+	lower_file->f_op->write(lower_file, (char __user *)page_virt,
+				PAGE_CACHE_SIZE, &lower_file->f_pos);
+	header_pages = ((crypt_stat->header_extent_size
+			 * crypt_stat->num_header_extents_at_front)
+			/ PAGE_CACHE_SIZE);
+	memset(page_virt, 0, PAGE_CACHE_SIZE);
+	current_header_page = 1;
+	while (current_header_page < header_pages) {
+		ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->"
+				"write() w/ zero'd page; lower_file->f_pos = "
+				"[0x%.16x]\n", lower_file->f_pos);
+		lower_file->f_op->write(lower_file, (char __user *)page_virt,
+					PAGE_CACHE_SIZE, &lower_file->f_pos);
+		current_header_page++;
+	}
+	set_fs(oldfs);
+	ecryptfs_printk(KERN_DEBUG,
+			"Done writing key packet set to underlying file.\n");
+out_free:
+	kmem_cache_free(ecryptfs_header_cache_0, page_virt);
+out:
+	return rc;
+}
+
+static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
+				 char *virt, int *bytes_read)
+{
+	int rc = 0;
+	u32 header_extent_size;
+	u16 num_header_extents_at_front;
+
+	memcpy(&header_extent_size, virt, 4);
+	header_extent_size = be32_to_cpu(header_extent_size);
+	virt += 4;
+	memcpy(&num_header_extents_at_front, virt, 2);
+	num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front);
+	crypt_stat->header_extent_size = (int)header_extent_size;
+	crypt_stat->num_header_extents_at_front =
+		(int)num_header_extents_at_front;
+	(*bytes_read) = 6;
+	if ((crypt_stat->header_extent_size
+	     * crypt_stat->num_header_extents_at_front)
+	    < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_WARNING, "Invalid header extent size: "
+				"[%d]\n", crypt_stat->header_extent_size);
+	}
+	return rc;
+}
+
+/**
+ * set_default_header_data
+ *
+ * For version 0 file format; this function is only for backwards
+ * compatibility for files created with the prior versions of
+ * eCryptfs.
+ */
+static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
+{
+	crypt_stat->header_extent_size = 4096;
+	crypt_stat->num_header_extents_at_front = 1;
+}
+
+/**
+ * ecryptfs_read_headers_virt
+ *
+ * Read/parse the header data. The header format is detailed in the
+ * comment block for the ecryptfs_write_headers_virt() function.
+ *
+ * Returns zero on success
+ */
+static int ecryptfs_read_headers_virt(char *page_virt,
+				      struct ecryptfs_crypt_stat *crypt_stat,
+				      struct dentry *ecryptfs_dentry)
+{
+	int rc = 0;
+	int offset;
+	int bytes_read;
+
+	ecryptfs_set_default_sizes(crypt_stat);
+	crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
+		ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	offset = ECRYPTFS_FILE_SIZE_BYTES;
+	rc = contains_ecryptfs_marker(page_virt + offset);
+	if (rc == 0) {
+		rc = -EINVAL;
+		goto out;
+	}
+	offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
+	rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset),
+				    &bytes_read);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error processing flags\n");
+		goto out;
+	}
+	if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) {
+		ecryptfs_printk(KERN_WARNING, "File version is [%d]; only "
+				"file version [%d] is supported by this "
+				"version of eCryptfs\n",
+				crypt_stat->file_version,
+				ECRYPTFS_SUPPORTED_FILE_VERSION);
+		rc = -EINVAL;
+		goto out;
+	}
+	offset += bytes_read;
+	if (crypt_stat->file_version >= 1) {
+		rc = parse_header_metadata(crypt_stat, (page_virt + offset),
+					   &bytes_read);
+		if (rc) {
+			ecryptfs_printk(KERN_WARNING, "Error reading header "
+					"metadata; rc = [%d]\n", rc);
+		}
+		offset += bytes_read;
+	} else
+		set_default_header_data(crypt_stat);
+	rc = ecryptfs_parse_packet_set(crypt_stat, (page_virt + offset),
+				       ecryptfs_dentry);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_read_headers
+ *
+ * Returns zero if valid headers found and parsed; non-zero otherwise
+ */
+int ecryptfs_read_headers(struct dentry *ecryptfs_dentry,
+			  struct file *lower_file)
+{
+	int rc = 0;
+	char *page_virt = NULL;
+	mm_segment_t oldfs;
+	ssize_t bytes_read;
+	struct ecryptfs_crypt_stat *crypt_stat =
+	    &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+
+	/* Read the first page from the underlying file */
+	page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, SLAB_USER);
+	if (!page_virt) {
+		rc = -ENOMEM;
+		ecryptfs_printk(KERN_ERR, "Unable to allocate page_virt\n");
+		goto out;
+	}
+	lower_file->f_pos = 0;
+	oldfs = get_fs();
+	set_fs(get_ds());
+	bytes_read = lower_file->f_op->read(lower_file,
+					    (char __user *)page_virt,
+					    ECRYPTFS_DEFAULT_EXTENT_SIZE,
+					    &lower_file->f_pos);
+	set_fs(oldfs);
+	if (bytes_read != ECRYPTFS_DEFAULT_EXTENT_SIZE) {
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
+					ecryptfs_dentry);
+	if (rc) {
+		ecryptfs_printk(KERN_DEBUG, "Valid eCryptfs headers not "
+				"found\n");
+		rc = -EINVAL;
+	}
+out:
+	if (page_virt) {
+		memset(page_virt, 0, PAGE_CACHE_SIZE);
+		kmem_cache_free(ecryptfs_header_cache_1, page_virt);
+	}
+	return rc;
+}
+
+/**
+ * ecryptfs_encode_filename - converts a plaintext file name to cipher text
+ * @crypt_stat: The crypt_stat struct associated with the file anem to encode
+ * @name: The plaintext name
+ * @length: The length of the plaintext
+ * @encoded_name: The encypted name
+ *
+ * Encrypts and encodes a filename into something that constitutes a
+ * valid filename for a filesystem, with printable characters.
+ *
+ * We assume that we have a properly initialized crypto context,
+ * pointed to by crypt_stat->tfm.
+ *
+ * TODO: Implement filename decoding and decryption here, in place of
+ * memcpy. We are keeping the framework around for now to (1)
+ * facilitate testing of the components needed to implement filename
+ * encryption and (2) to provide a code base from which other
+ * developers in the community can easily implement this feature.
+ *
+ * Returns the length of encoded filename; negative if error
+ */
+int
+ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+			 const char *name, int length, char **encoded_name)
+{
+	int error = 0;
+
+	(*encoded_name) = kmalloc(length + 2, GFP_KERNEL);
+	if (!(*encoded_name)) {
+		error = -ENOMEM;
+		goto out;
+	}
+	/* TODO: Filename encryption is a scheduled feature for a
+	 * future version of eCryptfs. This function is here only for
+	 * the purpose of providing a framework for other developers
+	 * to easily implement filename encryption. Hint: Replace this
+	 * memcpy() with a call to encrypt and encode the
+	 * filename, the set the length accordingly. */
+	memcpy((void *)(*encoded_name), (void *)name, length);
+	(*encoded_name)[length] = '\0';
+	error = length + 1;
+out:
+	return error;
+}
+
+/**
+ * ecryptfs_decode_filename - converts the cipher text name to plaintext
+ * @crypt_stat: The crypt_stat struct associated with the file
+ * @name: The filename in cipher text
+ * @length: The length of the cipher text name
+ * @decrypted_name: The plaintext name
+ *
+ * Decodes and decrypts the filename.
+ *
+ * We assume that we have a properly initialized crypto context,
+ * pointed to by crypt_stat->tfm.
+ *
+ * TODO: Implement filename decoding and decryption here, in place of
+ * memcpy. We are keeping the framework around for now to (1)
+ * facilitate testing of the components needed to implement filename
+ * encryption and (2) to provide a code base from which other
+ * developers in the community can easily implement this feature.
+ *
+ * Returns the length of decoded filename; negative if error
+ */
+int
+ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+			 const char *name, int length, char **decrypted_name)
+{
+	int error = 0;
+
+	(*decrypted_name) = kmalloc(length + 2, GFP_KERNEL);
+	if (!(*decrypted_name)) {
+		error = -ENOMEM;
+		goto out;
+	}
+	/* TODO: Filename encryption is a scheduled feature for a
+	 * future version of eCryptfs. This function is here only for
+	 * the purpose of providing a framework for other developers
+	 * to easily implement filename encryption. Hint: Replace this
+	 * memcpy() with a call to decode and decrypt the
+	 * filename, the set the length accordingly. */
+	memcpy((void *)(*decrypted_name), (void *)name, length);
+	(*decrypted_name)[length + 1] = '\0';	/* Only for convenience
+						 * in printing out the
+						 * string in debug
+						 * messages */
+	error = length;
+out:
+	return error;
+}
+
+/**
+ * ecryptfs_process_cipher - Perform cipher initialization.
+ * @tfm: Crypto context set by this function
+ * @key_tfm: Crypto context for key material, set by this function
+ * @cipher_name: Name of the cipher.
+ * @key_size: Size of the key in bytes.
+ *
+ * Returns zero on success. Any crypto_tfm structs allocated here
+ * should be released by other functions, such as on a superblock put
+ * event, regardless of whether this function succeeds for fails.
+ */
+int
+ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm,
+			char *cipher_name, size_t key_size)
+{
+	char dummy_key[ECRYPTFS_MAX_KEY_BYTES];
+	int rc;
+
+	*tfm = *key_tfm = NULL;
+	if (key_size > ECRYPTFS_MAX_KEY_BYTES) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum "
+		       "allowable is [%d]\n", key_size, ECRYPTFS_MAX_KEY_BYTES);
+		goto out;
+	}
+	*tfm = crypto_alloc_tfm(cipher_name, (ECRYPTFS_DEFAULT_CHAINING_MODE
+					      | CRYPTO_TFM_REQ_WEAK_KEY));
+	if (!(*tfm)) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Unable to allocate crypto cipher with name "
+		       "[%s]\n", cipher_name);
+		goto out;
+	}
+	*key_tfm = crypto_alloc_tfm(cipher_name, CRYPTO_TFM_REQ_WEAK_KEY);
+	if (!(*key_tfm)) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Unable to allocate crypto cipher with name "
+		       "[%s]\n", cipher_name);
+		goto out;
+	}
+	if (key_size < crypto_tfm_alg_min_keysize(*tfm)) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Request key size is [%Zd]; minimum key size "
+		       "supported by cipher [%s] is [%d]\n", key_size,
+		       cipher_name, crypto_tfm_alg_min_keysize(*tfm));
+		goto out;
+	}
+	if (key_size < crypto_tfm_alg_min_keysize(*key_tfm)) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Request key size is [%Zd]; minimum key size "
+		       "supported by cipher [%s] is [%d]\n", key_size,
+		       cipher_name, crypto_tfm_alg_min_keysize(*key_tfm));
+		goto out;
+	}
+	if (key_size > crypto_tfm_alg_max_keysize(*tfm)) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Request key size is [%Zd]; maximum key size "
+		       "supported by cipher [%s] is [%d]\n", key_size,
+		       cipher_name, crypto_tfm_alg_min_keysize(*tfm));
+		goto out;
+	}
+	if (key_size > crypto_tfm_alg_max_keysize(*key_tfm)) {
+		rc = -EINVAL;
+		printk(KERN_ERR "Request key size is [%Zd]; maximum key size "
+		       "supported by cipher [%s] is [%d]\n", key_size,
+		       cipher_name, crypto_tfm_alg_min_keysize(*key_tfm));
+		goto out;
+	}
+	get_random_bytes(dummy_key, key_size);
+	rc = crypto_cipher_setkey(*tfm, dummy_key, key_size);
+	if (rc) {
+		printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
+		       "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc);
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = crypto_cipher_setkey(*key_tfm, dummy_key, key_size);
+	if (rc) {
+		printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
+		       "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc);
+		rc = -EINVAL;
+		goto out;
+	}
+out:
+	return rc;
+}
diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c
new file mode 100644
index 000000000000..61f8e894284f
--- /dev/null
+++ b/fs/ecryptfs/debug.c
@@ -0,0 +1,123 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * Functions only useful for debugging.
+ *
+ * Copyright (C) 2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_dump_auth_tok - debug function to print auth toks
+ *
+ * This function will print the contents of an ecryptfs authentication
+ * token.
+ */
+void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok)
+{
+	char salt[ECRYPTFS_SALT_SIZE * 2 + 1];
+	char sig[ECRYPTFS_SIG_SIZE_HEX + 1];
+
+	ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n",
+			auth_tok);
+	if (ECRYPTFS_CHECK_FLAG(auth_tok->flags, ECRYPTFS_PRIVATE_KEY)) {
+		ecryptfs_printk(KERN_DEBUG, " * private key type\n");
+		ecryptfs_printk(KERN_DEBUG, " * (NO PRIVATE KEY SUPPORT "
+				"IN ECRYPTFS VERSION 0.1)\n");
+	} else {
+		ecryptfs_printk(KERN_DEBUG, " * passphrase type\n");
+		ecryptfs_to_hex(salt, auth_tok->token.password.salt,
+				ECRYPTFS_SALT_SIZE);
+		salt[ECRYPTFS_SALT_SIZE * 2] = '\0';
+		ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt);
+		if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags,
+					ECRYPTFS_PERSISTENT_PASSWORD)) {
+			ecryptfs_printk(KERN_DEBUG, " * persistent\n");
+		}
+		memcpy(sig, auth_tok->token.password.signature,
+		       ECRYPTFS_SIG_SIZE_HEX);
+		sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+		ecryptfs_printk(KERN_DEBUG, " * signature = [%s]\n", sig);
+	}
+	ecryptfs_printk(KERN_DEBUG, " * session_key.flags = [0x%x]\n",
+			auth_tok->session_key.flags);
+	if (auth_tok->session_key.flags
+	    & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT)
+		ecryptfs_printk(KERN_DEBUG,
+				" * Userspace decrypt request set\n");
+	if (auth_tok->session_key.flags
+	    & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT)
+		ecryptfs_printk(KERN_DEBUG,
+				" * Userspace encrypt request set\n");
+	if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_DECRYPTED_KEY) {
+		ecryptfs_printk(KERN_DEBUG, " * Contains decrypted key\n");
+		ecryptfs_printk(KERN_DEBUG,
+				" * session_key.decrypted_key_size = [0x%x]\n",
+				auth_tok->session_key.decrypted_key_size);
+		ecryptfs_printk(KERN_DEBUG, " * Decrypted session key "
+				"dump:\n");
+		if (ecryptfs_verbosity > 0)
+			ecryptfs_dump_hex(auth_tok->session_key.decrypted_key,
+					  ECRYPTFS_DEFAULT_KEY_BYTES);
+	}
+	if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_ENCRYPTED_KEY) {
+		ecryptfs_printk(KERN_DEBUG, " * Contains encrypted key\n");
+		ecryptfs_printk(KERN_DEBUG,
+				" * session_key.encrypted_key_size = [0x%x]\n",
+				auth_tok->session_key.encrypted_key_size);
+		ecryptfs_printk(KERN_DEBUG, " * Encrypted session key "
+				"dump:\n");
+		if (ecryptfs_verbosity > 0)
+			ecryptfs_dump_hex(auth_tok->session_key.encrypted_key,
+					  auth_tok->session_key.
+					  encrypted_key_size);
+	}
+}
+
+/**
+ * ecryptfs_dump_hex - debug hex printer
+ * @data: string of bytes to be printed
+ * @bytes: number of bytes to print
+ *
+ * Dump hexadecimal representation of char array
+ */
+void ecryptfs_dump_hex(char *data, int bytes)
+{
+	int i = 0;
+	int add_newline = 1;
+
+	if (ecryptfs_verbosity < 1)
+		return;
+	if (bytes != 0) {
+		printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]);
+		i++;
+	}
+	while (i < bytes) {
+		printk("0x%.2x.", (unsigned char)data[i]);
+		i++;
+		if (i % 16 == 0) {
+			printk("\n");
+			add_newline = 0;
+		} else
+			add_newline = 1;
+	}
+	if (add_newline)
+		printk("\n");
+}
+
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
new file mode 100644
index 000000000000..f0d2a433242b
--- /dev/null
+++ b/fs/ecryptfs/dentry.c
@@ -0,0 +1,87 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_d_revalidate - revalidate an ecryptfs dentry
+ * @dentry: The ecryptfs dentry
+ * @nd: The associated nameidata
+ *
+ * Called when the VFS needs to revalidate a dentry. This
+ * is called whenever a name lookup finds a dentry in the
+ * dcache. Most filesystems leave this as NULL, because all their
+ * dentries in the dcache are valid.
+ *
+ * Returns 1 if valid, 0 otherwise.
+ *
+ */
+static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+	struct dentry *dentry_save;
+	struct vfsmount *vfsmount_save;
+	int rc = 1;
+
+	if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
+		goto out;
+	dentry_save = nd->dentry;
+	vfsmount_save = nd->mnt;
+	nd->dentry = lower_dentry;
+	nd->mnt = lower_mnt;
+	rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
+	nd->dentry = dentry_save;
+	nd->mnt = vfsmount_save;
+out:
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_dentry_info_cache;
+
+/**
+ * ecryptfs_d_release
+ * @dentry: The ecryptfs dentry
+ *
+ * Called when a dentry is really deallocated.
+ */
+static void ecryptfs_d_release(struct dentry *dentry)
+{
+	struct dentry *lower_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (ecryptfs_dentry_to_private(dentry))
+		kmem_cache_free(ecryptfs_dentry_info_cache,
+				ecryptfs_dentry_to_private(dentry));
+	if (lower_dentry)
+		dput(lower_dentry);
+	return;
+}
+
+struct dentry_operations ecryptfs_dops = {
+	.d_revalidate = ecryptfs_d_revalidate,
+	.d_release = ecryptfs_d_release,
+};
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
new file mode 100644
index 000000000000..872c9958531a
--- /dev/null
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -0,0 +1,482 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * Kernel declarations.
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#ifndef ECRYPTFS_KERNEL_H
+#define ECRYPTFS_KERNEL_H
+
+#include <keys/user-type.h>
+#include <linux/fs.h>
+#include <linux/scatterlist.h>
+
+/* Version verification for shared data structures w/ userspace */
+#define ECRYPTFS_VERSION_MAJOR 0x00
+#define ECRYPTFS_VERSION_MINOR 0x04
+#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x01
+/* These flags indicate which features are supported by the kernel
+ * module; userspace tools such as the mount helper read
+ * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine
+ * how to behave. */
+#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001
+#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002
+#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004
+#define ECRYPTFS_VERSIONING_POLICY 0x00000008
+#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
+                                  | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH)
+
+#define ECRYPTFS_MAX_PASSWORD_LENGTH 64
+#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
+#define ECRYPTFS_SALT_SIZE 8
+#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2)
+/* The original signature size is only for what is stored on disk; all
+ * in-memory representations are expanded hex, so it better adapted to
+ * be passed around or referenced on the command line */
+#define ECRYPTFS_SIG_SIZE 8
+#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2)
+#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX
+#define ECRYPTFS_MAX_KEY_BYTES 64
+#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512
+#define ECRYPTFS_DEFAULT_IV_BYTES 16
+#define ECRYPTFS_FILE_VERSION 0x01
+#define ECRYPTFS_DEFAULT_HEADER_EXTENT_SIZE 8192
+#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096
+#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192
+
+#define RFC2440_CIPHER_DES3_EDE 0x02
+#define RFC2440_CIPHER_CAST_5 0x03
+#define RFC2440_CIPHER_BLOWFISH 0x04
+#define RFC2440_CIPHER_AES_128 0x07
+#define RFC2440_CIPHER_AES_192 0x08
+#define RFC2440_CIPHER_AES_256 0x09
+#define RFC2440_CIPHER_TWOFISH 0x0a
+#define RFC2440_CIPHER_CAST_6 0x0b
+
+#define ECRYPTFS_SET_FLAG(flag_bit_vector, flag) (flag_bit_vector |= (flag))
+#define ECRYPTFS_CLEAR_FLAG(flag_bit_vector, flag) (flag_bit_vector &= ~(flag))
+#define ECRYPTFS_CHECK_FLAG(flag_bit_vector, flag) (flag_bit_vector & (flag))
+
+/**
+ * For convenience, we may need to pass around the encrypted session
+ * key between kernel and userspace because the authentication token
+ * may not be extractable.  For example, the TPM may not release the
+ * private key, instead requiring the encrypted data and returning the
+ * decrypted data.
+ */
+struct ecryptfs_session_key {
+#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001
+#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002
+#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004
+#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008
+	u32 flags;
+	u32 encrypted_key_size;
+	u32 decrypted_key_size;
+	u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
+	u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES];
+};
+
+struct ecryptfs_password {
+	u32 password_bytes;
+	s32 hash_algo;
+	u32 hash_iterations;
+	u32 session_key_encryption_key_bytes;
+#define ECRYPTFS_PERSISTENT_PASSWORD 0x01
+#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02
+	u32 flags;
+	/* Iterated-hash concatenation of salt and passphrase */
+	u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
+	u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1];
+	/* Always in expanded hex */
+	u8 salt[ECRYPTFS_SALT_SIZE];
+};
+
+enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY};
+
+/* May be a password or a private key */
+struct ecryptfs_auth_tok {
+	u16 version; /* 8-bit major and 8-bit minor */
+	u16 token_type;
+	u32 flags;
+	struct ecryptfs_session_key session_key;
+	u8 reserved[32];
+	union {
+		struct ecryptfs_password password;
+		/* Private key is in future eCryptfs releases */
+	} token;
+} __attribute__ ((packed));
+
+void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok);
+extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size);
+extern void ecryptfs_from_hex(char *dst, char *src, int dst_size);
+
+struct ecryptfs_key_record {
+	unsigned char type;
+	size_t enc_key_size;
+	unsigned char sig[ECRYPTFS_SIG_SIZE];
+	unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES];
+};
+
+struct ecryptfs_auth_tok_list {
+	struct ecryptfs_auth_tok *auth_tok;
+	struct list_head list;
+};
+
+struct ecryptfs_crypt_stat;
+struct ecryptfs_mount_crypt_stat;
+
+struct ecryptfs_page_crypt_context {
+	struct page *page;
+#define ECRYPTFS_PREPARE_COMMIT_MODE 0
+#define ECRYPTFS_WRITEPAGE_MODE      1
+	unsigned int mode;
+	union {
+		struct file *lower_file;
+		struct writeback_control *wbc;
+	} param;
+};
+
+static inline struct ecryptfs_auth_tok *
+ecryptfs_get_key_payload_data(struct key *key)
+{
+	return (struct ecryptfs_auth_tok *)
+		(((struct user_key_payload*)key->payload.data)->data);
+}
+
+#define ECRYPTFS_SUPER_MAGIC 0xf15f
+#define ECRYPTFS_MAX_KEYSET_SIZE 1024
+#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
+#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
+#define ECRYPTFS_MAX_NUM_KEYSIGS 2 /* TODO: Make this a linked list */
+#define ECRYPTFS_MAX_IV_BYTES 16	/* 128 bits */
+#define ECRYPTFS_SALT_BYTES 2
+#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
+#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8	/* 4*2 */
+#define ECRYPTFS_FILE_SIZE_BYTES 8
+#define ECRYPTFS_DEFAULT_CIPHER "aes"
+#define ECRYPTFS_DEFAULT_KEY_BYTES 16
+#define ECRYPTFS_DEFAULT_CHAINING_MODE CRYPTO_TFM_MODE_CBC
+#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
+#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
+#define MD5_DIGEST_SIZE 16
+
+/**
+ * This is the primary struct associated with each encrypted file.
+ *
+ * TODO: cache align/pack?
+ */
+struct ecryptfs_crypt_stat {
+#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
+#define ECRYPTFS_POLICY_APPLIED     0x00000002
+#define ECRYPTFS_NEW_FILE           0x00000004
+#define ECRYPTFS_ENCRYPTED          0x00000008
+#define ECRYPTFS_SECURITY_WARNING   0x00000010
+#define ECRYPTFS_ENABLE_HMAC        0x00000020
+#define ECRYPTFS_ENCRYPT_IV_PAGES   0x00000040
+#define ECRYPTFS_KEY_VALID          0x00000080
+	u32 flags;
+	unsigned int file_version;
+	size_t iv_bytes;
+	size_t num_keysigs;
+	size_t header_extent_size;
+	size_t num_header_extents_at_front;
+	size_t extent_size; /* Data extent size; default is 4096 */
+	size_t key_size;
+	size_t extent_shift;
+	unsigned int extent_mask;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	struct crypto_tfm *tfm;
+	struct crypto_tfm *md5_tfm; /* Crypto context for generating
+				     * the initialization vectors */
+	unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+	unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
+	unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
+	unsigned char keysigs[ECRYPTFS_MAX_NUM_KEYSIGS][ECRYPTFS_SIG_SIZE_HEX];
+	struct mutex cs_tfm_mutex;
+	struct mutex cs_md5_tfm_mutex;
+	struct mutex cs_mutex;
+};
+
+/* inode private data. */
+struct ecryptfs_inode_info {
+	struct inode vfs_inode;
+	struct inode *wii_inode;
+	struct ecryptfs_crypt_stat crypt_stat;
+};
+
+/* dentry private data. Each dentry must keep track of a lower
+ * vfsmount too. */
+struct ecryptfs_dentry_info {
+	struct dentry *wdi_dentry;
+	struct vfsmount *lower_mnt;
+	struct ecryptfs_crypt_stat *crypt_stat;
+};
+
+/**
+ * This struct is to enable a mount-wide passphrase/salt combo. This
+ * is more or less a stopgap to provide similar functionality to other
+ * crypto filesystems like EncFS or CFS until full policy support is
+ * implemented in eCryptfs.
+ */
+struct ecryptfs_mount_crypt_stat {
+	/* Pointers to memory we do not own, do not free these */
+#define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001
+	u32 flags;
+	struct ecryptfs_auth_tok *global_auth_tok;
+	struct key *global_auth_tok_key;
+	size_t global_default_cipher_key_size;
+	struct crypto_tfm *global_key_tfm;
+	struct mutex global_key_tfm_mutex;
+	unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
+						 + 1];
+	unsigned char global_auth_tok_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
+};
+
+/* superblock private data. */
+struct ecryptfs_sb_info {
+	struct super_block *wsi_sb;
+	struct ecryptfs_mount_crypt_stat mount_crypt_stat;
+};
+
+/* file private data. */
+struct ecryptfs_file_info {
+	struct file *wfi_file;
+	struct ecryptfs_crypt_stat *crypt_stat;
+};
+
+/* auth_tok <=> encrypted_session_key mappings */
+struct ecryptfs_auth_tok_list_item {
+	unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES];
+	struct list_head list;
+	struct ecryptfs_auth_tok auth_tok;
+};
+
+static inline struct ecryptfs_file_info *
+ecryptfs_file_to_private(struct file *file)
+{
+	return (struct ecryptfs_file_info *)file->private_data;
+}
+
+static inline void
+ecryptfs_set_file_private(struct file *file,
+			  struct ecryptfs_file_info *file_info)
+{
+	file->private_data = file_info;
+}
+
+static inline struct file *ecryptfs_file_to_lower(struct file *file)
+{
+	return ((struct ecryptfs_file_info *)file->private_data)->wfi_file;
+}
+
+static inline void
+ecryptfs_set_file_lower(struct file *file, struct file *lower_file)
+{
+	((struct ecryptfs_file_info *)file->private_data)->wfi_file =
+		lower_file;
+}
+
+static inline struct ecryptfs_inode_info *
+ecryptfs_inode_to_private(struct inode *inode)
+{
+	return container_of(inode, struct ecryptfs_inode_info, vfs_inode);
+}
+
+static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode)
+{
+	return ecryptfs_inode_to_private(inode)->wii_inode;
+}
+
+static inline void
+ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode)
+{
+	ecryptfs_inode_to_private(inode)->wii_inode = lower_inode;
+}
+
+static inline struct ecryptfs_sb_info *
+ecryptfs_superblock_to_private(struct super_block *sb)
+{
+	return (struct ecryptfs_sb_info *)sb->s_fs_info;
+}
+
+static inline void
+ecryptfs_set_superblock_private(struct super_block *sb,
+				struct ecryptfs_sb_info *sb_info)
+{
+	sb->s_fs_info = sb_info;
+}
+
+static inline struct super_block *
+ecryptfs_superblock_to_lower(struct super_block *sb)
+{
+	return ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb;
+}
+
+static inline void
+ecryptfs_set_superblock_lower(struct super_block *sb,
+			      struct super_block *lower_sb)
+{
+	((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb;
+}
+
+static inline struct ecryptfs_dentry_info *
+ecryptfs_dentry_to_private(struct dentry *dentry)
+{
+	return (struct ecryptfs_dentry_info *)dentry->d_fsdata;
+}
+
+static inline void
+ecryptfs_set_dentry_private(struct dentry *dentry,
+			    struct ecryptfs_dentry_info *dentry_info)
+{
+	dentry->d_fsdata = dentry_info;
+}
+
+static inline struct dentry *
+ecryptfs_dentry_to_lower(struct dentry *dentry)
+{
+	return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry;
+}
+
+static inline void
+ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry)
+{
+	((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry =
+		lower_dentry;
+}
+
+static inline struct vfsmount *
+ecryptfs_dentry_to_lower_mnt(struct dentry *dentry)
+{
+	return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt;
+}
+
+static inline void
+ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt)
+{
+	((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt =
+		lower_mnt;
+}
+
+#define ecryptfs_printk(type, fmt, arg...) \
+        __ecryptfs_printk(type "%s: " fmt, __FUNCTION__, ## arg);
+void __ecryptfs_printk(const char *fmt, ...);
+
+extern const struct file_operations ecryptfs_main_fops;
+extern const struct file_operations ecryptfs_dir_fops;
+extern struct inode_operations ecryptfs_main_iops;
+extern struct inode_operations ecryptfs_dir_iops;
+extern struct inode_operations ecryptfs_symlink_iops;
+extern struct super_operations ecryptfs_sops;
+extern struct dentry_operations ecryptfs_dops;
+extern struct address_space_operations ecryptfs_aops;
+extern int ecryptfs_verbosity;
+
+extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
+extern struct kmem_cache *ecryptfs_file_info_cache;
+extern struct kmem_cache *ecryptfs_dentry_info_cache;
+extern struct kmem_cache *ecryptfs_inode_info_cache;
+extern struct kmem_cache *ecryptfs_sb_info_cache;
+extern struct kmem_cache *ecryptfs_header_cache_0;
+extern struct kmem_cache *ecryptfs_header_cache_1;
+extern struct kmem_cache *ecryptfs_header_cache_2;
+extern struct kmem_cache *ecryptfs_lower_page_cache;
+
+int ecryptfs_interpose(struct dentry *hidden_dentry,
+		       struct dentry *this_dentry, struct super_block *sb,
+		       int flag);
+int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
+int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+			     const char *name, int length,
+			     char **decrypted_name);
+int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
+			     const char *name, int length,
+			     char **encoded_name);
+struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
+void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src);
+void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src);
+void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src);
+void ecryptfs_dump_hex(char *data, int bytes);
+int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
+			int sg_size);
+int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_rotate_iv(unsigned char *iv);
+void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat);
+void ecryptfs_destruct_mount_crypt_stat(
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
+int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
+int ecryptfs_write_inode_size_to_header(struct file *lower_file,
+					struct inode *lower_inode,
+					struct inode *inode);
+int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
+			    struct file *lower_file,
+			    unsigned long lower_page_index, int byte_offset,
+			    int region_bytes);
+int
+ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode,
+			   struct file *lower_file, int byte_offset,
+			   int region_size);
+int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode,
+				struct file *lower_file);
+int ecryptfs_do_readpage(struct file *file, struct page *page,
+			 pgoff_t lower_page_index);
+int ecryptfs_grab_and_map_lower_page(struct page **lower_page,
+				     char **lower_virt,
+				     struct inode *lower_inode,
+				     unsigned long lower_page_index);
+int ecryptfs_writepage_and_release_lower_page(struct page *lower_page,
+					      struct inode *lower_inode,
+					      struct writeback_control *wbc);
+int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx);
+int ecryptfs_decrypt_page(struct file *file, struct page *page);
+int ecryptfs_write_headers(struct dentry *ecryptfs_dentry,
+			   struct file *lower_file);
+int ecryptfs_write_headers_virt(char *page_virt,
+				struct ecryptfs_crypt_stat *crypt_stat,
+				struct dentry *ecryptfs_dentry);
+int ecryptfs_read_headers(struct dentry *ecryptfs_dentry,
+			  struct file *lower_file);
+int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
+int contains_ecryptfs_marker(char *data);
+int ecryptfs_read_header_region(char *data, struct dentry *dentry,
+				struct vfsmount *mnt);
+u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat);
+int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code);
+void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
+int ecryptfs_generate_key_packet_set(char *dest_base,
+				     struct ecryptfs_crypt_stat *crypt_stat,
+				     struct dentry *ecryptfs_dentry,
+				     size_t *len, size_t max);
+int process_request_key_err(long err_code);
+int
+ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
+			  unsigned char *src, struct dentry *ecryptfs_dentry);
+int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
+int
+ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm,
+			char *cipher_name, size_t key_size);
+int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
+int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
+void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
+
+#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
new file mode 100644
index 000000000000..c8550c9f9cd2
--- /dev/null
+++ b/fs/ecryptfs/file.c
@@ -0,0 +1,440 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2004 Erez Zadok
+ * Copyright (C) 2001-2004 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
+ *   		Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/file.h>
+#include <linux/poll.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/security.h>
+#include <linux/smp_lock.h>
+#include <linux/compat.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * ecryptfs_llseek
+ * @file: File we are seeking in
+ * @offset: The offset to seek to
+ * @origin: 2 - offset from i_size; 1 - offset from f_pos
+ *
+ * Returns the position we have seeked to, or negative on error
+ */
+static loff_t ecryptfs_llseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t rv;
+	loff_t new_end_pos;
+	int rc;
+	int expanding_file = 0;
+	struct inode *inode = file->f_mapping->host;
+
+	/* If our offset is past the end of our file, we're going to
+	 * need to grow it so we have a valid length of 0's */
+	new_end_pos = offset;
+	switch (origin) {
+	case 2:
+		new_end_pos += i_size_read(inode);
+		expanding_file = 1;
+		break;
+	case 1:
+		new_end_pos += file->f_pos;
+		if (new_end_pos > i_size_read(inode)) {
+			ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
+					"> i_size_read(inode)(=[0x%.16x])\n",
+					new_end_pos, i_size_read(inode));
+			expanding_file = 1;
+		}
+		break;
+	default:
+		if (new_end_pos > i_size_read(inode)) {
+			ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) "
+					"> i_size_read(inode)(=[0x%.16x])\n",
+					new_end_pos, i_size_read(inode));
+			expanding_file = 1;
+		}
+	}
+	ecryptfs_printk(KERN_DEBUG, "new_end_pos = [0x%.16x]\n", new_end_pos);
+	if (expanding_file) {
+		rc = ecryptfs_truncate(file->f_dentry, new_end_pos);
+		if (rc) {
+			rv = rc;
+			ecryptfs_printk(KERN_ERR, "Error on attempt to "
+					"truncate to (higher) offset [0x%.16x];"
+					" rc = [%d]\n", new_end_pos, rc);
+			goto out;
+		}
+	}
+	rv = generic_file_llseek(file, offset, origin);
+out:
+	return rv;
+}
+
+/**
+ * ecryptfs_read_update_atime
+ *
+ * generic_file_read updates the atime of upper layer inode.  But, it
+ * doesn't give us a chance to update the atime of the lower layer
+ * inode.  This function is a wrapper to generic_file_read.  It
+ * updates the atime of the lower level inode if generic_file_read
+ * returns without any errors. This is to be used only for file reads.
+ * The function to be used for directory reads is ecryptfs_read.
+ */
+static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
+				const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	struct vfsmount *lower_vfsmount;
+	struct file *file = iocb->ki_filp;
+
+	rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
+	/*
+	 * Even though this is a async interface, we need to wait
+	 * for IO to finish to update atime
+	 */
+	if (-EIOCBQUEUED == rc)
+		rc = wait_on_sync_kiocb(iocb);
+	if (rc >= 0) {
+		lower_dentry = ecryptfs_dentry_to_lower(file->f_dentry);
+		lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_dentry);
+		touch_atime(lower_vfsmount, lower_dentry);
+	}
+	return rc;
+}
+
+struct ecryptfs_getdents_callback {
+	void *dirent;
+	struct dentry *dentry;
+	filldir_t filldir;
+	int err;
+	int filldir_called;
+	int entries_written;
+};
+
+/* Inspired by generic filldir in fs/readir.c */
+static int
+ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
+		 u64 ino, unsigned int d_type)
+{
+	struct ecryptfs_crypt_stat *crypt_stat;
+	struct ecryptfs_getdents_callback *buf =
+	    (struct ecryptfs_getdents_callback *)dirent;
+	int rc;
+	int decoded_length;
+	char *decoded_name;
+
+	crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
+	buf->filldir_called++;
+	decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen,
+						  &decoded_name);
+	if (decoded_length < 0) {
+		rc = decoded_length;
+		goto out;
+	}
+	rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset,
+			  ino, d_type);
+	kfree(decoded_name);
+	if (rc >= 0)
+		buf->entries_written++;
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_readdir
+ * @file: The ecryptfs file struct
+ * @dirent: Directory entry
+ * @filldir: The filldir callback function
+ */
+static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+	int rc;
+	struct file *lower_file;
+	struct inode *inode;
+	struct ecryptfs_getdents_callback buf;
+
+	lower_file = ecryptfs_file_to_lower(file);
+	lower_file->f_pos = file->f_pos;
+	inode = file->f_dentry->d_inode;
+	memset(&buf, 0, sizeof(buf));
+	buf.dirent = dirent;
+	buf.dentry = file->f_dentry;
+	buf.filldir = filldir;
+retry:
+	buf.filldir_called = 0;
+	buf.entries_written = 0;
+	buf.err = 0;
+	rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf);
+	if (buf.err)
+		rc = buf.err;
+	if (buf.filldir_called && !buf.entries_written)
+		goto retry;
+	file->f_pos = lower_file->f_pos;
+	if (rc >= 0)
+		ecryptfs_copy_attr_atime(inode, lower_file->f_dentry->d_inode);
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_file_info_cache;
+
+/**
+ * ecryptfs_open
+ * @inode: inode speciying file to open
+ * @file: Structure to return filled in
+ *
+ * Opens the file specified by inode.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_open(struct inode *inode, struct file *file)
+{
+	int rc = 0;
+	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	struct dentry *ecryptfs_dentry = file->f_dentry;
+	/* Private value of ecryptfs_dentry allocated in
+	 * ecryptfs_lookup() */
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+	struct inode *lower_inode = NULL;
+	struct file *lower_file = NULL;
+	struct vfsmount *lower_mnt;
+	struct ecryptfs_file_info *file_info;
+	int lower_flags;
+
+	/* Released in ecryptfs_release or end of function if failure */
+	file_info = kmem_cache_alloc(ecryptfs_file_info_cache, SLAB_KERNEL);
+	ecryptfs_set_file_private(file, file_info);
+	if (!file_info) {
+		ecryptfs_printk(KERN_ERR,
+				"Error attempting to allocate memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	memset(file_info, 0, sizeof(*file_info));
+	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+		ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	mutex_lock(&crypt_stat->cs_mutex);
+	if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED)) {
+		ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n");
+		/* Policy code enabled in future release */
+		ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED);
+		ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
+	}
+	mutex_unlock(&crypt_stat->cs_mutex);
+	/* This mntget & dget is undone via fput when the file is released */
+	dget(lower_dentry);
+	lower_flags = file->f_flags;
+	if ((lower_flags & O_ACCMODE) == O_WRONLY)
+		lower_flags = (lower_flags & O_ACCMODE) | O_RDWR;
+	if (file->f_flags & O_APPEND)
+		lower_flags &= ~O_APPEND;
+	lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
+	mntget(lower_mnt);
+	/* Corresponding fput() in ecryptfs_release() */
+	lower_file = dentry_open(lower_dentry, lower_mnt, lower_flags);
+	if (IS_ERR(lower_file)) {
+		rc = PTR_ERR(lower_file);
+		ecryptfs_printk(KERN_ERR, "Error opening lower file\n");
+		goto out_puts;
+	}
+	ecryptfs_set_file_lower(file, lower_file);
+	/* Isn't this check the same as the one in lookup? */
+	lower_inode = lower_dentry->d_inode;
+	if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
+		ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
+		ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
+		rc = 0;
+		goto out;
+	}
+	mutex_lock(&crypt_stat->cs_mutex);
+	if (i_size_read(lower_inode) < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) {
+		if (!(mount_crypt_stat->flags
+		      & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
+			rc = -EIO;
+			printk(KERN_WARNING "Attempt to read file that is "
+			       "not in a valid eCryptfs format, and plaintext "
+			       "passthrough mode is not enabled; returning "
+			       "-EIO\n");
+			mutex_unlock(&crypt_stat->cs_mutex);
+			goto out_puts;
+		}
+		crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
+		rc = 0;
+		mutex_unlock(&crypt_stat->cs_mutex);
+		goto out;
+	} else if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+					ECRYPTFS_POLICY_APPLIED)
+		   || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags,
+					   ECRYPTFS_KEY_VALID)) {
+		rc = ecryptfs_read_headers(ecryptfs_dentry, lower_file);
+		if (rc) {
+			ecryptfs_printk(KERN_DEBUG,
+					"Valid headers not found\n");
+			if (!(mount_crypt_stat->flags
+			      & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
+				rc = -EIO;
+				printk(KERN_WARNING "Attempt to read file that "
+				       "is not in a valid eCryptfs format, "
+				       "and plaintext passthrough mode is not "
+				       "enabled; returning -EIO\n");
+				mutex_unlock(&crypt_stat->cs_mutex);
+				goto out_puts;
+			}
+			ECRYPTFS_CLEAR_FLAG(crypt_stat->flags,
+					    ECRYPTFS_ENCRYPTED);
+			rc = 0;
+			mutex_unlock(&crypt_stat->cs_mutex);
+			goto out;
+		}
+	}
+	mutex_unlock(&crypt_stat->cs_mutex);
+	ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] "
+			"size: [0x%.16x]\n", inode, inode->i_ino,
+			i_size_read(inode));
+	ecryptfs_set_file_lower(file, lower_file);
+	goto out;
+out_puts:
+	mntput(lower_mnt);
+	dput(lower_dentry);
+	kmem_cache_free(ecryptfs_file_info_cache,
+			ecryptfs_file_to_private(file));
+out:
+	return rc;
+}
+
+static int ecryptfs_flush(struct file *file, fl_owner_t td)
+{
+	int rc = 0;
+	struct file *lower_file = NULL;
+
+	lower_file = ecryptfs_file_to_lower(file);
+	if (lower_file->f_op && lower_file->f_op->flush)
+		rc = lower_file->f_op->flush(lower_file, td);
+	return rc;
+}
+
+static int ecryptfs_release(struct inode *inode, struct file *file)
+{
+	struct file *lower_file = ecryptfs_file_to_lower(file);
+	struct ecryptfs_file_info *file_info = ecryptfs_file_to_private(file);
+	struct inode *lower_inode = ecryptfs_inode_to_lower(inode);
+
+	fput(lower_file);
+	inode->i_blocks = lower_inode->i_blocks;
+	kmem_cache_free(ecryptfs_file_info_cache, file_info);
+	return 0;
+}
+
+static int
+ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct file *lower_file = ecryptfs_file_to_lower(file);
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	struct inode *lower_inode = lower_dentry->d_inode;
+	int rc = -EINVAL;
+
+	if (lower_inode->i_fop->fsync) {
+		mutex_lock(&lower_inode->i_mutex);
+		rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
+					       datasync);
+		mutex_unlock(&lower_inode->i_mutex);
+	}
+	return rc;
+}
+
+static int ecryptfs_fasync(int fd, struct file *file, int flag)
+{
+	int rc = 0;
+	struct file *lower_file = NULL;
+
+	lower_file = ecryptfs_file_to_lower(file);
+	if (lower_file->f_op && lower_file->f_op->fasync)
+		rc = lower_file->f_op->fasync(fd, lower_file, flag);
+	return rc;
+}
+
+static ssize_t ecryptfs_sendfile(struct file *file, loff_t * ppos,
+				 size_t count, read_actor_t actor, void *target)
+{
+	struct file *lower_file = NULL;
+	int rc = -EINVAL;
+
+	lower_file = ecryptfs_file_to_lower(file);
+	if (lower_file->f_op && lower_file->f_op->sendfile)
+		rc = lower_file->f_op->sendfile(lower_file, ppos, count,
+						actor, target);
+
+	return rc;
+}
+
+static int ecryptfs_ioctl(struct inode *inode, struct file *file,
+			  unsigned int cmd, unsigned long arg);
+
+const struct file_operations ecryptfs_dir_fops = {
+	.readdir = ecryptfs_readdir,
+	.ioctl = ecryptfs_ioctl,
+	.mmap = generic_file_mmap,
+	.open = ecryptfs_open,
+	.flush = ecryptfs_flush,
+	.release = ecryptfs_release,
+	.fsync = ecryptfs_fsync,
+	.fasync = ecryptfs_fasync,
+	.sendfile = ecryptfs_sendfile,
+};
+
+const struct file_operations ecryptfs_main_fops = {
+	.llseek = ecryptfs_llseek,
+	.read = do_sync_read,
+	.aio_read = ecryptfs_read_update_atime,
+	.write = do_sync_write,
+	.aio_write = generic_file_aio_write,
+	.readdir = ecryptfs_readdir,
+	.ioctl = ecryptfs_ioctl,
+	.mmap = generic_file_mmap,
+	.open = ecryptfs_open,
+	.flush = ecryptfs_flush,
+	.release = ecryptfs_release,
+	.fsync = ecryptfs_fsync,
+	.fasync = ecryptfs_fasync,
+	.sendfile = ecryptfs_sendfile,
+};
+
+static int
+ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+	       unsigned long arg)
+{
+	int rc = 0;
+	struct file *lower_file = NULL;
+
+	if (ecryptfs_file_to_private(file))
+		lower_file = ecryptfs_file_to_lower(file);
+	if (lower_file && lower_file->f_op && lower_file->f_op->ioctl)
+		rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode),
+					     lower_file, cmd, arg);
+	else
+		rc = -ENOTTY;
+	return rc;
+}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
new file mode 100644
index 000000000000..efdd2b7b62d7
--- /dev/null
+++ b/fs/ecryptfs/inode.c
@@ -0,0 +1,1079 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2004 Erez Zadok
+ * Copyright (C) 2001-2004 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *              Michael C. Thompsion <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/crypto.h>
+#include "ecryptfs_kernel.h"
+
+static struct dentry *lock_parent(struct dentry *dentry)
+{
+	struct dentry *dir;
+
+	dir = dget(dentry->d_parent);
+	mutex_lock(&(dir->d_inode->i_mutex));
+	return dir;
+}
+
+static void unlock_parent(struct dentry *dentry)
+{
+	mutex_unlock(&(dentry->d_parent->d_inode->i_mutex));
+	dput(dentry->d_parent);
+}
+
+static void unlock_dir(struct dentry *dir)
+{
+	mutex_unlock(&dir->d_inode->i_mutex);
+	dput(dir);
+}
+
+void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src)
+{
+	i_size_write(dst, i_size_read((struct inode *)src));
+	dst->i_blocks = src->i_blocks;
+}
+
+void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src)
+{
+	dest->i_atime = src->i_atime;
+}
+
+static void ecryptfs_copy_attr_times(struct inode *dest,
+				     const struct inode *src)
+{
+	dest->i_atime = src->i_atime;
+	dest->i_mtime = src->i_mtime;
+	dest->i_ctime = src->i_ctime;
+}
+
+static void ecryptfs_copy_attr_timesizes(struct inode *dest,
+					 const struct inode *src)
+{
+	dest->i_atime = src->i_atime;
+	dest->i_mtime = src->i_mtime;
+	dest->i_ctime = src->i_ctime;
+	ecryptfs_copy_inode_size(dest, src);
+}
+
+void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src)
+{
+	dest->i_mode = src->i_mode;
+	dest->i_nlink = src->i_nlink;
+	dest->i_uid = src->i_uid;
+	dest->i_gid = src->i_gid;
+	dest->i_rdev = src->i_rdev;
+	dest->i_atime = src->i_atime;
+	dest->i_mtime = src->i_mtime;
+	dest->i_ctime = src->i_ctime;
+	dest->i_blkbits = src->i_blkbits;
+	dest->i_flags = src->i_flags;
+}
+
+/**
+ * ecryptfs_create_underlying_file
+ * @lower_dir_inode: inode of the parent in the lower fs of the new file
+ * @lower_dentry: New file's dentry in the lower fs
+ * @ecryptfs_dentry: New file's dentry in ecryptfs
+ * @mode: The mode of the new file
+ * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
+ *
+ * Creates the file in the lower file system.
+ *
+ * Returns zero on success; non-zero on error condition
+ */
+static int
+ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
+				struct dentry *dentry, int mode,
+				struct nameidata *nd)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+	struct dentry *dentry_save;
+	struct vfsmount *vfsmount_save;
+	int rc;
+
+	dentry_save = nd->dentry;
+	vfsmount_save = nd->mnt;
+	nd->dentry = lower_dentry;
+	nd->mnt = lower_mnt;
+	rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
+	nd->dentry = dentry_save;
+	nd->mnt = vfsmount_save;
+	return rc;
+}
+
+/**
+ * ecryptfs_do_create
+ * @directory_inode: inode of the new file's dentry's parent in ecryptfs
+ * @ecryptfs_dentry: New file's dentry in ecryptfs
+ * @mode: The mode of the new file
+ * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
+ *
+ * Creates the underlying file and the eCryptfs inode which will link to
+ * it. It will also update the eCryptfs directory inode to mimic the
+ * stat of the lower directory inode.
+ *
+ * Returns zero on success; non-zero on error condition
+ */
+static int
+ecryptfs_do_create(struct inode *directory_inode,
+		   struct dentry *ecryptfs_dentry, int mode,
+		   struct nameidata *nd)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	if (unlikely(IS_ERR(lower_dir_dentry))) {
+		ecryptfs_printk(KERN_ERR, "Error locking directory of "
+				"dentry\n");
+		rc = PTR_ERR(lower_dir_dentry);
+		goto out;
+	}
+	rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode,
+					     ecryptfs_dentry, mode, nd);
+	if (unlikely(rc)) {
+		ecryptfs_printk(KERN_ERR,
+				"Failure to create underlying file\n");
+		goto out_lock;
+	}
+	rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
+				directory_inode->i_sb, 0);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
+		goto out_lock;
+	}
+	ecryptfs_copy_attr_timesizes(directory_inode,
+				     lower_dir_dentry->d_inode);
+out_lock:
+	unlock_dir(lower_dir_dentry);
+out:
+	return rc;
+}
+
+/**
+ * grow_file
+ * @ecryptfs_dentry: the ecryptfs dentry
+ * @lower_file: The lower file
+ * @inode: The ecryptfs inode
+ * @lower_inode: The lower inode
+ *
+ * This is the code which will grow the file to its correct size.
+ */
+static int grow_file(struct dentry *ecryptfs_dentry, struct file *lower_file,
+		     struct inode *inode, struct inode *lower_inode)
+{
+	int rc = 0;
+	struct file fake_file;
+	struct ecryptfs_file_info tmp_file_info;
+
+	memset(&fake_file, 0, sizeof(fake_file));
+	fake_file.f_dentry = ecryptfs_dentry;
+	memset(&tmp_file_info, 0, sizeof(tmp_file_info));
+	ecryptfs_set_file_private(&fake_file, &tmp_file_info);
+	ecryptfs_set_file_lower(&fake_file, lower_file);
+	rc = ecryptfs_fill_zeros(&fake_file, 1);
+	if (rc) {
+		ECRYPTFS_SET_FLAG(
+			ecryptfs_inode_to_private(inode)->crypt_stat.flags,
+			ECRYPTFS_SECURITY_WARNING);
+		ecryptfs_printk(KERN_WARNING, "Error attempting to fill zeros "
+				"in file; rc = [%d]\n", rc);
+		goto out;
+	}
+	i_size_write(inode, 0);
+	ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode);
+	ECRYPTFS_SET_FLAG(ecryptfs_inode_to_private(inode)->crypt_stat.flags,
+			  ECRYPTFS_NEW_FILE);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_initialize_file
+ *
+ * Cause the file to be changed from a basic empty file to an ecryptfs
+ * file with a header and first data page.
+ *
+ * Returns zero on success
+ */
+static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
+{
+	int rc = 0;
+	int lower_flags;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	struct dentry *lower_dentry;
+	struct dentry *tlower_dentry = NULL;
+	struct file *lower_file;
+	struct inode *inode, *lower_inode;
+	struct vfsmount *lower_mnt;
+
+	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
+	ecryptfs_printk(KERN_DEBUG, "lower_dentry->d_name.name = [%s]\n",
+			lower_dentry->d_name.name);
+	inode = ecryptfs_dentry->d_inode;
+	crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+	tlower_dentry = dget(lower_dentry);
+	if (!tlower_dentry) {
+		rc = -ENOMEM;
+		ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry\n");
+		goto out;
+	}
+	lower_flags = ((O_CREAT | O_WRONLY | O_TRUNC) & O_ACCMODE) | O_RDWR;
+#if BITS_PER_LONG != 32
+	lower_flags |= O_LARGEFILE;
+#endif
+	lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
+	mntget(lower_mnt);
+	/* Corresponding fput() at end of this function */
+	lower_file = dentry_open(tlower_dentry, lower_mnt, lower_flags);
+	if (IS_ERR(lower_file)) {
+		rc = PTR_ERR(lower_file);
+		ecryptfs_printk(KERN_ERR,
+				"Error opening dentry; rc = [%i]\n", rc);
+		goto out;
+	}
+	/* fput(lower_file) should handle the puts if we do this */
+	lower_file->f_dentry = tlower_dentry;
+	lower_file->f_vfsmnt = lower_mnt;
+	lower_inode = tlower_dentry->d_inode;
+	if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
+		ecryptfs_printk(KERN_DEBUG, "This is a directory\n");
+		ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED);
+		goto out_fput;
+	}
+	ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE);
+	ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
+	rc = ecryptfs_new_file_context(ecryptfs_dentry);
+	if (rc) {
+		ecryptfs_printk(KERN_DEBUG, "Error creating new file "
+				"context\n");
+		goto out_fput;
+	}
+	rc = ecryptfs_write_headers(ecryptfs_dentry, lower_file);
+	if (rc) {
+		ecryptfs_printk(KERN_DEBUG, "Error writing headers\n");
+		goto out_fput;
+	}
+	rc = grow_file(ecryptfs_dentry, lower_file, inode, lower_inode);
+out_fput:
+	fput(lower_file);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_create
+ * @dir: The inode of the directory in which to create the file.
+ * @dentry: The eCryptfs dentry
+ * @mode: The mode of the new file.
+ * @nd: nameidata
+ *
+ * Creates a new file.
+ *
+ * Returns zero on success; non-zero on error condition
+ */
+static int
+ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
+		int mode, struct nameidata *nd)
+{
+	int rc;
+
+	rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
+	if (unlikely(rc)) {
+		ecryptfs_printk(KERN_WARNING, "Failed to create file in"
+				"lower filesystem\n");
+		goto out;
+	}
+	/* At this point, a file exists on "disk"; we need to make sure
+	 * that this on disk file is prepared to be an ecryptfs file */
+	rc = ecryptfs_initialize_file(ecryptfs_dentry);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_lookup
+ * @dir: inode
+ * @dentry: The dentry
+ * @nd: nameidata, may be NULL
+ *
+ * Find a file on disk. If the file does not exist, then we'll add it to the
+ * dentry cache and continue on to read it from the disk.
+ */
+static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
+				      struct nameidata *nd)
+{
+	int rc = 0;
+	struct dentry *lower_dir_dentry;
+	struct dentry *lower_dentry;
+	struct vfsmount *lower_mnt;
+	struct dentry *tlower_dentry = NULL;
+	char *encoded_name;
+	unsigned int encoded_namelen;
+	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+	char *page_virt = NULL;
+	struct inode *lower_inode;
+	u64 file_size;
+
+	lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+	dentry->d_op = &ecryptfs_dops;
+	if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, "."))
+	    || (dentry->d_name.len == 2 && !strcmp(dentry->d_name.name, "..")))
+		goto out_drop;
+	encoded_namelen = ecryptfs_encode_filename(crypt_stat,
+						   dentry->d_name.name,
+						   dentry->d_name.len,
+						   &encoded_name);
+	if (encoded_namelen < 0) {
+		rc = encoded_namelen;
+		goto out_drop;
+	}
+	ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
+			"= [%d]\n", encoded_name, encoded_namelen);
+	lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
+				      encoded_namelen - 1);
+	kfree(encoded_name);
+	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
+	if (IS_ERR(lower_dentry)) {
+		ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
+		rc = PTR_ERR(lower_dentry);
+		goto out_drop;
+	}
+	ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
+       		"d_name.name = [%s]\n", lower_dentry,
+		lower_dentry->d_name.name);
+	lower_inode = lower_dentry->d_inode;
+	ecryptfs_copy_attr_atime(dir, lower_dir_dentry->d_inode);
+	BUG_ON(!atomic_read(&lower_dentry->d_count));
+	ecryptfs_set_dentry_private(dentry,
+				    kmem_cache_alloc(ecryptfs_dentry_info_cache,
+						     SLAB_KERNEL));
+	if (!ecryptfs_dentry_to_private(dentry)) {
+		rc = -ENOMEM;
+		ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting "
+				"to allocate ecryptfs_dentry_info struct\n");
+		goto out_dput;
+	}
+	ecryptfs_set_dentry_lower(dentry, lower_dentry);
+	ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+	if (!lower_dentry->d_inode) {
+		/* We want to add because we couldn't find in lower */
+		d_add(dentry, NULL);
+		goto out;
+	}
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error interposing\n");
+		goto out_dput;
+	}
+	if (S_ISDIR(lower_inode->i_mode)) {
+		ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
+		goto out;
+	}
+	if (S_ISLNK(lower_inode->i_mode)) {
+		ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
+		goto out;
+	}
+	if (!nd) {
+		ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
+				"as we *think* we are about to unlink\n");
+		goto out;
+	}
+	tlower_dentry = dget(lower_dentry);
+	if (!tlower_dentry || IS_ERR(tlower_dentry)) {
+		rc = -ENOMEM;
+		ecryptfs_printk(KERN_ERR, "Cannot dget lower_dentry\n");
+		goto out_dput;
+	}
+	/* Released in this function */
+	page_virt =
+	    (char *)kmem_cache_alloc(ecryptfs_header_cache_2,
+				     SLAB_USER);
+	if (!page_virt) {
+		rc = -ENOMEM;
+		ecryptfs_printk(KERN_ERR,
+				"Cannot ecryptfs_kmalloc a page\n");
+		goto out_dput;
+	}
+	memset(page_virt, 0, PAGE_CACHE_SIZE);
+	rc = ecryptfs_read_header_region(page_virt, tlower_dentry, nd->mnt);
+	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+	if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED))
+		ecryptfs_set_default_sizes(crypt_stat);
+	if (rc) {
+		rc = 0;
+		ecryptfs_printk(KERN_WARNING, "Error reading header region;"
+				" assuming unencrypted\n");
+	} else {
+		if (!contains_ecryptfs_marker(page_virt
+					      + ECRYPTFS_FILE_SIZE_BYTES)) {
+			kmem_cache_free(ecryptfs_header_cache_2, page_virt);
+			goto out;
+		}
+		memcpy(&file_size, page_virt, sizeof(file_size));
+		file_size = be64_to_cpu(file_size);
+		i_size_write(dentry->d_inode, (loff_t)file_size);
+	}
+	kmem_cache_free(ecryptfs_header_cache_2, page_virt);
+	goto out;
+
+out_dput:
+	dput(lower_dentry);
+	if (tlower_dentry)
+		dput(tlower_dentry);
+out_drop:
+	d_drop(dentry);
+out:
+	return ERR_PTR(rc);
+}
+
+static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
+			 struct dentry *new_dentry)
+{
+	struct dentry *lower_old_dentry;
+	struct dentry *lower_new_dentry;
+	struct dentry *lower_dir_dentry;
+	u64 file_size_save;
+	int rc;
+
+	file_size_save = i_size_read(old_dentry->d_inode);
+	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
+	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
+	dget(lower_old_dentry);
+	dget(lower_new_dentry);
+	lower_dir_dentry = lock_parent(lower_new_dentry);
+	rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode,
+		      lower_new_dentry);
+	if (rc || !lower_new_dentry->d_inode)
+		goto out_lock;
+	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
+	if (rc)
+		goto out_lock;
+	ecryptfs_copy_attr_timesizes(dir, lower_new_dentry->d_inode);
+	old_dentry->d_inode->i_nlink =
+		ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
+	i_size_write(new_dentry->d_inode, file_size_save);
+out_lock:
+	unlock_dir(lower_dir_dentry);
+	dput(lower_new_dentry);
+	dput(lower_old_dentry);
+	if (!new_dentry->d_inode)
+		d_drop(new_dentry);
+	return rc;
+}
+
+static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int rc = 0;
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
+
+	lock_parent(lower_dentry);
+	rc = vfs_unlink(lower_dir_inode, lower_dentry);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error in vfs_unlink\n");
+		goto out_unlock;
+	}
+	ecryptfs_copy_attr_times(dir, lower_dir_inode);
+	dentry->d_inode->i_nlink =
+		ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink;
+	dentry->d_inode->i_ctime = dir->i_ctime;
+out_unlock:
+	unlock_parent(lower_dentry);
+	return rc;
+}
+
+static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
+			    const char *symname)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
+	umode_t mode;
+	char *encoded_symname;
+	unsigned int encoded_symlen;
+	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	dget(lower_dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	mode = S_IALLUGO;
+	encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
+						  strlen(symname),
+						  &encoded_symname);
+	if (encoded_symlen < 0) {
+		rc = encoded_symlen;
+		goto out_lock;
+	}
+	rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
+			 encoded_symname, mode);
+	kfree(encoded_symname);
+	if (rc || !lower_dentry->d_inode)
+		goto out_lock;
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+	if (rc)
+		goto out_lock;
+	ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
+out_lock:
+	unlock_dir(lower_dir_dentry);
+	dput(lower_dentry);
+	if (!dentry->d_inode)
+		d_drop(dentry);
+	return rc;
+}
+
+static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
+	if (rc || !lower_dentry->d_inode)
+		goto out;
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+	if (rc)
+		goto out;
+	ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
+	dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
+out:
+	unlock_dir(lower_dir_dentry);
+	if (!dentry->d_inode)
+		d_drop(dentry);
+	return rc;
+}
+
+static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int rc = 0;
+	struct dentry *tdentry = NULL;
+	struct dentry *lower_dentry;
+	struct dentry *tlower_dentry = NULL;
+	struct dentry *lower_dir_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (!(tdentry = dget(dentry))) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_ERR, "Error dget'ing dentry [%p]\n",
+				dentry);
+		goto out;
+	}
+	lower_dir_dentry = lock_parent(lower_dentry);
+	if (!(tlower_dentry = dget(lower_dentry))) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry "
+				"[%p]\n", lower_dentry);
+		goto out;
+	}
+	rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry);
+	if (!rc) {
+		d_delete(tlower_dentry);
+		tlower_dentry = NULL;
+	}
+	ecryptfs_copy_attr_times(dir, lower_dir_dentry->d_inode);
+	dir->i_nlink = lower_dir_dentry->d_inode->i_nlink;
+	unlock_dir(lower_dir_dentry);
+	if (!rc)
+		d_drop(dentry);
+out:
+	if (tdentry)
+		dput(tdentry);
+	if (tlower_dentry)
+		dput(tlower_dentry);
+	return rc;
+}
+
+static int
+ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	struct dentry *lower_dir_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	lower_dir_dentry = lock_parent(lower_dentry);
+	rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
+	if (rc || !lower_dentry->d_inode)
+		goto out;
+	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+	if (rc)
+		goto out;
+	ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode);
+out:
+	unlock_dir(lower_dir_dentry);
+	if (!dentry->d_inode)
+		d_drop(dentry);
+	return rc;
+}
+
+static int
+ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	int rc;
+	struct dentry *lower_old_dentry;
+	struct dentry *lower_new_dentry;
+	struct dentry *lower_old_dir_dentry;
+	struct dentry *lower_new_dir_dentry;
+
+	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
+	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
+	dget(lower_old_dentry);
+	dget(lower_new_dentry);
+	lower_old_dir_dentry = dget_parent(lower_old_dentry);
+	lower_new_dir_dentry = dget_parent(lower_new_dentry);
+	lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+	rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
+			lower_new_dir_dentry->d_inode, lower_new_dentry);
+	if (rc)
+		goto out_lock;
+	ecryptfs_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
+	if (new_dir != old_dir)
+		ecryptfs_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
+out_lock:
+	unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+	dput(lower_new_dentry);
+	dput(lower_old_dentry);
+	return rc;
+}
+
+static int
+ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
+{
+	int rc;
+	struct dentry *lower_dentry;
+	char *decoded_name;
+	char *lower_buf;
+	mm_segment_t old_fs;
+	struct ecryptfs_crypt_stat *crypt_stat;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (!lower_dentry->d_inode->i_op ||
+	    !lower_dentry->d_inode->i_op->readlink) {
+		rc = -EINVAL;
+		goto out;
+	}
+	/* Released in this function */
+	lower_buf = kmalloc(bufsiz, GFP_KERNEL);
+	if (lower_buf == NULL) {
+		ecryptfs_printk(KERN_ERR, "Out of memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	old_fs = get_fs();
+	set_fs(get_ds());
+	ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
+			"lower_dentry->d_name.name = [%s]\n",
+			lower_dentry->d_name.name);
+	rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
+						   (char __user *)lower_buf,
+						   bufsiz);
+	set_fs(old_fs);
+	if (rc >= 0) {
+		crypt_stat = NULL;
+		rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc,
+					      &decoded_name);
+		if (rc == -ENOMEM)
+			goto out_free_lower_buf;
+		if (rc > 0) {
+			ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
+					"to userspace: [%*s]\n", rc,
+					decoded_name);
+			if (copy_to_user(buf, decoded_name, rc))
+				rc = -EFAULT;
+		}
+		kfree(decoded_name);
+		ecryptfs_copy_attr_atime(dentry->d_inode,
+					 lower_dentry->d_inode);
+	}
+out_free_lower_buf:
+	kfree(lower_buf);
+out:
+	return rc;
+}
+
+static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	char *buf;
+	int len = PAGE_SIZE, rc;
+	mm_segment_t old_fs;
+
+	/* Released in ecryptfs_put_link(); only release here on error */
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	old_fs = get_fs();
+	set_fs(get_ds());
+	ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
+			"dentry->d_name.name = [%s]\n", dentry->d_name.name);
+	rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
+	buf[rc] = '\0';
+	set_fs(old_fs);
+	if (rc < 0)
+		goto out_free;
+	rc = 0;
+	nd_set_link(nd, buf);
+	goto out;
+out_free:
+	kfree(buf);
+out:
+	return ERR_PTR(rc);
+}
+
+static void
+ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
+{
+	/* Free the char* */
+	kfree(nd_get_link(nd));
+}
+
+/**
+ * upper_size_to_lower_size
+ * @crypt_stat: Crypt_stat associated with file
+ * @upper_size: Size of the upper file
+ *
+ * Calculate the requried size of the lower file based on the
+ * specified size of the upper file. This calculation is based on the
+ * number of headers in the underlying file and the extent size.
+ *
+ * Returns Calculated size of the lower file.
+ */
+static loff_t
+upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
+			 loff_t upper_size)
+{
+	loff_t lower_size;
+
+	lower_size = ( crypt_stat->header_extent_size
+		       * crypt_stat->num_header_extents_at_front );
+	if (upper_size != 0) {
+		loff_t num_extents;
+
+		num_extents = upper_size >> crypt_stat->extent_shift;
+		if (upper_size & ~crypt_stat->extent_mask)
+			num_extents++;
+		lower_size += (num_extents * crypt_stat->extent_size);
+	}
+	return lower_size;
+}
+
+/**
+ * ecryptfs_truncate
+ * @dentry: The ecryptfs layer dentry
+ * @new_length: The length to expand the file to
+ *
+ * Function to handle truncations modifying the size of the file. Note
+ * that the file sizes are interpolated. When expanding, we are simply
+ * writing strings of 0's out. When truncating, we need to modify the
+ * underlying file size according to the page index interpolations.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
+{
+	int rc = 0;
+	struct inode *inode = dentry->d_inode;
+	struct dentry *lower_dentry;
+	struct vfsmount *lower_mnt;
+	struct file fake_ecryptfs_file, *lower_file = NULL;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	loff_t i_size = i_size_read(inode);
+	loff_t lower_size_before_truncate;
+	loff_t lower_size_after_truncate;
+
+	if (unlikely((new_length == i_size)))
+		goto out;
+	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+	/* Set up a fake ecryptfs file, this is used to interface with
+	 * the file in the underlying filesystem so that the
+	 * truncation has an effect there as well. */
+	memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
+	fake_ecryptfs_file.f_dentry = dentry;
+	/* Released at out_free: label */
+	ecryptfs_set_file_private(&fake_ecryptfs_file,
+				  kmem_cache_alloc(ecryptfs_file_info_cache,
+						   SLAB_KERNEL));
+	if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	/* This dget & mntget is released through fput at out_fput: */
+	dget(lower_dentry);
+	lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+	mntget(lower_mnt);
+	lower_file = dentry_open(lower_dentry, lower_mnt, O_RDWR);
+	if (unlikely(IS_ERR(lower_file))) {
+		rc = PTR_ERR(lower_file);
+		goto out_free;
+	}
+	ecryptfs_set_file_lower(&fake_ecryptfs_file, lower_file);
+	/* Switch on growing or shrinking file */
+	if (new_length > i_size) {
+		rc = ecryptfs_fill_zeros(&fake_ecryptfs_file, new_length);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR,
+					"Problem with fill_zeros\n");
+			goto out_fput;
+		}
+		i_size_write(inode, new_length);
+		rc = ecryptfs_write_inode_size_to_header(lower_file,
+							 lower_dentry->d_inode,
+							 inode);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR,
+					"Problem with ecryptfs_write"
+					"_inode_size\n");
+			goto out_fput;
+		}
+	} else { /* new_length < i_size_read(inode) */
+		vmtruncate(inode, new_length);
+		ecryptfs_write_inode_size_to_header(lower_file,
+						    lower_dentry->d_inode,
+						    inode);
+		/* We are reducing the size of the ecryptfs file, and need to
+		 * know if we need to reduce the size of the lower file. */
+		lower_size_before_truncate =
+		    upper_size_to_lower_size(crypt_stat, i_size);
+		lower_size_after_truncate =
+		    upper_size_to_lower_size(crypt_stat, new_length);
+		if (lower_size_after_truncate < lower_size_before_truncate)
+			vmtruncate(lower_dentry->d_inode,
+				   lower_size_after_truncate);
+	}
+	/* Update the access times */
+	lower_dentry->d_inode->i_mtime = lower_dentry->d_inode->i_ctime
+		= CURRENT_TIME;
+	mark_inode_dirty_sync(inode);
+out_fput:
+	fput(lower_file);
+out_free:
+	if (ecryptfs_file_to_private(&fake_ecryptfs_file))
+		kmem_cache_free(ecryptfs_file_info_cache,
+				ecryptfs_file_to_private(&fake_ecryptfs_file));
+out:
+	return rc;
+}
+
+static int
+ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	int rc;
+
+        if (nd) {
+		struct vfsmount *vfsmnt_save = nd->mnt;
+		struct dentry *dentry_save = nd->dentry;
+
+		nd->mnt = ecryptfs_dentry_to_lower_mnt(nd->dentry);
+		nd->dentry = ecryptfs_dentry_to_lower(nd->dentry);
+		rc = permission(ecryptfs_inode_to_lower(inode), mask, nd);
+		nd->mnt = vfsmnt_save;
+		nd->dentry = dentry_save;
+        } else
+		rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL);
+        return rc;
+}
+
+/**
+ * ecryptfs_setattr
+ * @dentry: dentry handle to the inode to modify
+ * @ia: Structure with flags of what to change and values
+ *
+ * Updates the metadata of an inode. If the update is to the size
+ * i.e. truncation, then ecryptfs_truncate will handle the size modification
+ * of both the ecryptfs inode and the lower inode.
+ *
+ * All other metadata changes will be passed right to the lower filesystem,
+ * and we will just update our inode to look like the lower.
+ */
+static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	int rc = 0;
+	struct dentry *lower_dentry;
+	struct inode *inode;
+	struct inode *lower_inode;
+	struct ecryptfs_crypt_stat *crypt_stat;
+
+	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	inode = dentry->d_inode;
+	lower_inode = ecryptfs_inode_to_lower(inode);
+	if (ia->ia_valid & ATTR_SIZE) {
+		ecryptfs_printk(KERN_DEBUG,
+				"ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n",
+				ia->ia_valid, ATTR_SIZE);
+		rc = ecryptfs_truncate(dentry, ia->ia_size);
+		/* ecryptfs_truncate handles resizing of the lower file */
+		ia->ia_valid &= ~ATTR_SIZE;
+		ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n",
+				ia->ia_valid);
+		if (rc < 0)
+			goto out;
+	}
+	rc = notify_change(lower_dentry, ia);
+out:
+	ecryptfs_copy_attr_all(inode, lower_inode);
+	return rc;
+}
+
+static int
+ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		  size_t size, int flags)
+{
+	int rc = 0;
+	struct dentry *lower_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (!lower_dentry->d_inode->i_op->setxattr) {
+		rc = -ENOSYS;
+		goto out;
+	}
+	mutex_lock(&lower_dentry->d_inode->i_mutex);
+	rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value,
+						   size, flags);
+	mutex_unlock(&lower_dentry->d_inode->i_mutex);
+out:
+	return rc;
+}
+
+static ssize_t
+ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value,
+		  size_t size)
+{
+	int rc = 0;
+	struct dentry *lower_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (!lower_dentry->d_inode->i_op->getxattr) {
+		rc = -ENOSYS;
+		goto out;
+	}
+	mutex_lock(&lower_dentry->d_inode->i_mutex);
+	rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value,
+						   size);
+	mutex_unlock(&lower_dentry->d_inode->i_mutex);
+out:
+	return rc;
+}
+
+static ssize_t
+ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	int rc = 0;
+	struct dentry *lower_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (!lower_dentry->d_inode->i_op->listxattr) {
+		rc = -ENOSYS;
+		goto out;
+	}
+	mutex_lock(&lower_dentry->d_inode->i_mutex);
+	rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size);
+	mutex_unlock(&lower_dentry->d_inode->i_mutex);
+out:
+	return rc;
+}
+
+static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
+{
+	int rc = 0;
+	struct dentry *lower_dentry;
+
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	if (!lower_dentry->d_inode->i_op->removexattr) {
+		rc = -ENOSYS;
+		goto out;
+	}
+	mutex_lock(&lower_dentry->d_inode->i_mutex);
+	rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name);
+	mutex_unlock(&lower_dentry->d_inode->i_mutex);
+out:
+	return rc;
+}
+
+int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
+{
+	if ((ecryptfs_inode_to_lower(inode)
+	     == (struct inode *)candidate_lower_inode))
+		return 1;
+	else
+		return 0;
+}
+
+int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
+{
+	ecryptfs_init_inode(inode, (struct inode *)lower_inode);
+	return 0;
+}
+
+struct inode_operations ecryptfs_symlink_iops = {
+	.readlink = ecryptfs_readlink,
+	.follow_link = ecryptfs_follow_link,
+	.put_link = ecryptfs_put_link,
+	.permission = ecryptfs_permission,
+	.setattr = ecryptfs_setattr,
+	.setxattr = ecryptfs_setxattr,
+	.getxattr = ecryptfs_getxattr,
+	.listxattr = ecryptfs_listxattr,
+	.removexattr = ecryptfs_removexattr
+};
+
+struct inode_operations ecryptfs_dir_iops = {
+	.create = ecryptfs_create,
+	.lookup = ecryptfs_lookup,
+	.link = ecryptfs_link,
+	.unlink = ecryptfs_unlink,
+	.symlink = ecryptfs_symlink,
+	.mkdir = ecryptfs_mkdir,
+	.rmdir = ecryptfs_rmdir,
+	.mknod = ecryptfs_mknod,
+	.rename = ecryptfs_rename,
+	.permission = ecryptfs_permission,
+	.setattr = ecryptfs_setattr,
+	.setxattr = ecryptfs_setxattr,
+	.getxattr = ecryptfs_getxattr,
+	.listxattr = ecryptfs_listxattr,
+	.removexattr = ecryptfs_removexattr
+};
+
+struct inode_operations ecryptfs_main_iops = {
+	.permission = ecryptfs_permission,
+	.setattr = ecryptfs_setattr,
+	.setxattr = ecryptfs_setxattr,
+	.getxattr = ecryptfs_getxattr,
+	.listxattr = ecryptfs_listxattr,
+	.removexattr = ecryptfs_removexattr
+};
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
new file mode 100644
index 000000000000..ba454785a0c5
--- /dev/null
+++ b/fs/ecryptfs/keystore.c
@@ -0,0 +1,1061 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * In-kernel key management code.  Includes functions to parse and
+ * write authentication token-related packets with the underlying
+ * file.
+ *
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com>
+ *              Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/pagemap.h>
+#include <linux/key.h>
+#include <linux/random.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * request_key returned an error instead of a valid key address;
+ * determine the type of error, make appropriate log entries, and
+ * return an error code.
+ */
+int process_request_key_err(long err_code)
+{
+	int rc = 0;
+
+	switch (err_code) {
+	case ENOKEY:
+		ecryptfs_printk(KERN_WARNING, "No key\n");
+		rc = -ENOENT;
+		break;
+	case EKEYEXPIRED:
+		ecryptfs_printk(KERN_WARNING, "Key expired\n");
+		rc = -ETIME;
+		break;
+	case EKEYREVOKED:
+		ecryptfs_printk(KERN_WARNING, "Key revoked\n");
+		rc = -EINVAL;
+		break;
+	default:
+		ecryptfs_printk(KERN_WARNING, "Unknown error code: "
+				"[0x%.16x]\n", err_code);
+		rc = -EINVAL;
+	}
+	return rc;
+}
+
+static void wipe_auth_tok_list(struct list_head *auth_tok_list_head)
+{
+	struct list_head *walker;
+	struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+
+	walker = auth_tok_list_head->next;
+	while (walker != auth_tok_list_head) {
+		auth_tok_list_item =
+		    list_entry(walker, struct ecryptfs_auth_tok_list_item,
+			       list);
+		walker = auth_tok_list_item->list.next;
+		memset(auth_tok_list_item, 0,
+		       sizeof(struct ecryptfs_auth_tok_list_item));
+		kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
+				auth_tok_list_item);
+	}
+}
+
+struct kmem_cache *ecryptfs_auth_tok_list_item_cache;
+
+/**
+ * parse_packet_length
+ * @data: Pointer to memory containing length at offset
+ * @size: This function writes the decoded size to this memory
+ *        address; zero on error
+ * @length_size: The number of bytes occupied by the encoded length
+ *
+ * Returns Zero on success
+ */
+static int parse_packet_length(unsigned char *data, size_t *size,
+			       size_t *length_size)
+{
+	int rc = 0;
+
+	(*length_size) = 0;
+	(*size) = 0;
+	if (data[0] < 192) {
+		/* One-byte length */
+		(*size) = data[0];
+		(*length_size) = 1;
+	} else if (data[0] < 224) {
+		/* Two-byte length */
+		(*size) = ((data[0] - 192) * 256);
+		(*size) += (data[1] + 192);
+		(*length_size) = 2;
+	} else if (data[0] == 255) {
+		/* Five-byte length; we're not supposed to see this */
+		ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
+				"supported\n");
+		rc = -EINVAL;
+		goto out;
+	} else {
+		ecryptfs_printk(KERN_ERR, "Error parsing packet length\n");
+		rc = -EINVAL;
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/**
+ * write_packet_length
+ * @dest: The byte array target into which to write the
+ *       length. Must have at least 5 bytes allocated.
+ * @size: The length to write.
+ * @packet_size_length: The number of bytes used to encode the
+ *                      packet length is written to this address.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int write_packet_length(char *dest, size_t size,
+			       size_t *packet_size_length)
+{
+	int rc = 0;
+
+	if (size < 192) {
+		dest[0] = size;
+		(*packet_size_length) = 1;
+	} else if (size < 65536) {
+		dest[0] = (((size - 192) / 256) + 192);
+		dest[1] = ((size - 192) % 256);
+		(*packet_size_length) = 2;
+	} else {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_WARNING,
+				"Unsupported packet size: [%d]\n", size);
+	}
+	return rc;
+}
+
+/**
+ * parse_tag_3_packet
+ * @crypt_stat: The cryptographic context to modify based on packet
+ *              contents.
+ * @data: The raw bytes of the packet.
+ * @auth_tok_list: eCryptfs parses packets into authentication tokens;
+ *                 a new authentication token will be placed at the end
+ *                 of this list for this packet.
+ * @new_auth_tok: Pointer to a pointer to memory that this function
+ *                allocates; sets the memory address of the pointer to
+ *                NULL on error. This object is added to the
+ *                auth_tok_list.
+ * @packet_size: This function writes the size of the parsed packet
+ *               into this memory location; zero on error.
+ * @max_packet_size: maximum number of bytes to parse
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
+		   unsigned char *data, struct list_head *auth_tok_list,
+		   struct ecryptfs_auth_tok **new_auth_tok,
+		   size_t *packet_size, size_t max_packet_size)
+{
+	int rc = 0;
+	size_t body_size;
+	struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+	size_t length_size;
+
+	(*packet_size) = 0;
+	(*new_auth_tok) = NULL;
+
+	/* we check that:
+	 *   one byte for the Tag 3 ID flag
+	 *   two bytes for the body size
+	 * do not exceed the maximum_packet_size
+	 */
+	if (unlikely((*packet_size) + 3 > max_packet_size)) {
+		ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* check for Tag 3 identifyer - one byte */
+	if (data[(*packet_size)++] != ECRYPTFS_TAG_3_PACKET_TYPE) {
+		ecryptfs_printk(KERN_ERR, "Enter w/ first byte != 0x%.2x\n",
+				ECRYPTFS_TAG_3_PACKET_TYPE);
+		rc = -EINVAL;
+		goto out;
+	}
+	/* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or
+	 * at end of function upon failure */
+	auth_tok_list_item =
+	    kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, SLAB_KERNEL);
+	if (!auth_tok_list_item) {
+		ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	memset(auth_tok_list_item, 0,
+	       sizeof(struct ecryptfs_auth_tok_list_item));
+	(*new_auth_tok) = &auth_tok_list_item->auth_tok;
+
+	/* check for body size - one to two bytes */
+	rc = parse_packet_length(&data[(*packet_size)], &body_size,
+				 &length_size);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error parsing packet length; "
+				"rc = [%d]\n", rc);
+		goto out_free;
+	}
+	if (unlikely(body_size < (0x05 + ECRYPTFS_SALT_SIZE))) {
+		ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n",
+				body_size);
+		rc = -EINVAL;
+		goto out_free;
+	}
+	(*packet_size) += length_size;
+
+	/* now we know the length of the remainting Tag 3 packet size:
+	 *   5 fix bytes for: version string, cipher, S2K ID, hash algo,
+	 *                    number of hash iterations
+	 *   ECRYPTFS_SALT_SIZE bytes for salt
+	 *   body_size bytes minus the stuff above is the encrypted key size
+	 */
+	if (unlikely((*packet_size) + body_size > max_packet_size)) {
+		ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
+		rc = -EINVAL;
+		goto out_free;
+	}
+
+	/* There are 5 characters of additional information in the
+	 * packet */
+	(*new_auth_tok)->session_key.encrypted_key_size =
+		body_size - (0x05 + ECRYPTFS_SALT_SIZE);
+	ecryptfs_printk(KERN_DEBUG, "Encrypted key size = [%d]\n",
+			(*new_auth_tok)->session_key.encrypted_key_size);
+
+	/* Version 4 (from RFC2440) - one byte */
+	if (unlikely(data[(*packet_size)++] != 0x04)) {
+		ecryptfs_printk(KERN_DEBUG, "Unknown version number "
+				"[%d]\n", data[(*packet_size) - 1]);
+		rc = -EINVAL;
+		goto out_free;
+	}
+
+	/* cipher - one byte */
+	ecryptfs_cipher_code_to_string(crypt_stat->cipher,
+				       (u16)data[(*packet_size)]);
+	/* A little extra work to differentiate among the AES key
+	 * sizes; see RFC2440 */
+	switch(data[(*packet_size)++]) {
+	case RFC2440_CIPHER_AES_192:
+		crypt_stat->key_size = 24;
+		break;
+	default:
+		crypt_stat->key_size =
+			(*new_auth_tok)->session_key.encrypted_key_size;
+	}
+	ecryptfs_init_crypt_ctx(crypt_stat);
+	/* S2K identifier 3 (from RFC2440) */
+	if (unlikely(data[(*packet_size)++] != 0x03)) {
+		ecryptfs_printk(KERN_ERR, "Only S2K ID 3 is currently "
+				"supported\n");
+		rc = -ENOSYS;
+		goto out_free;
+	}
+
+	/* TODO: finish the hash mapping */
+	/* hash algorithm - one byte */
+	switch (data[(*packet_size)++]) {
+	case 0x01: /* See RFC2440 for these numbers and their mappings */
+		/* Choose MD5 */
+		/* salt - ECRYPTFS_SALT_SIZE bytes */
+		memcpy((*new_auth_tok)->token.password.salt,
+		       &data[(*packet_size)], ECRYPTFS_SALT_SIZE);
+		(*packet_size) += ECRYPTFS_SALT_SIZE;
+
+		/* This conversion was taken straight from RFC2440 */
+		/* number of hash iterations - one byte */
+		(*new_auth_tok)->token.password.hash_iterations =
+			((u32) 16 + (data[(*packet_size)] & 15))
+				<< ((data[(*packet_size)] >> 4) + 6);
+		(*packet_size)++;
+
+		/* encrypted session key -
+		 *   (body_size-5-ECRYPTFS_SALT_SIZE) bytes */
+		memcpy((*new_auth_tok)->session_key.encrypted_key,
+		       &data[(*packet_size)],
+		       (*new_auth_tok)->session_key.encrypted_key_size);
+		(*packet_size) +=
+			(*new_auth_tok)->session_key.encrypted_key_size;
+		(*new_auth_tok)->session_key.flags &=
+			~ECRYPTFS_CONTAINS_DECRYPTED_KEY;
+		(*new_auth_tok)->session_key.flags |=
+			ECRYPTFS_CONTAINS_ENCRYPTED_KEY;
+		(*new_auth_tok)->token.password.hash_algo = 0x01;
+		break;
+	default:
+		ecryptfs_printk(KERN_ERR, "Unsupported hash algorithm: "
+				"[%d]\n", data[(*packet_size) - 1]);
+		rc = -ENOSYS;
+		goto out_free;
+	}
+	(*new_auth_tok)->token_type = ECRYPTFS_PASSWORD;
+	/* TODO: Parametarize; we might actually want userspace to
+	 * decrypt the session key. */
+	ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags,
+			    ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT);
+	ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags,
+			    ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT);
+	list_add(&auth_tok_list_item->list, auth_tok_list);
+	goto out;
+out_free:
+	(*new_auth_tok) = NULL;
+	memset(auth_tok_list_item, 0,
+	       sizeof(struct ecryptfs_auth_tok_list_item));
+	kmem_cache_free(ecryptfs_auth_tok_list_item_cache,
+			auth_tok_list_item);
+out:
+	if (rc)
+		(*packet_size) = 0;
+	return rc;
+}
+
+/**
+ * parse_tag_11_packet
+ * @data: The raw bytes of the packet
+ * @contents: This function writes the data contents of the literal
+ *            packet into this memory location
+ * @max_contents_bytes: The maximum number of bytes that this function
+ *                      is allowed to write into contents
+ * @tag_11_contents_size: This function writes the size of the parsed
+ *                        contents into this memory location; zero on
+ *                        error
+ * @packet_size: This function writes the size of the parsed packet
+ *               into this memory location; zero on error
+ * @max_packet_size: maximum number of bytes to parse
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+parse_tag_11_packet(unsigned char *data, unsigned char *contents,
+		    size_t max_contents_bytes, size_t *tag_11_contents_size,
+		    size_t *packet_size, size_t max_packet_size)
+{
+	int rc = 0;
+	size_t body_size;
+	size_t length_size;
+
+	(*packet_size) = 0;
+	(*tag_11_contents_size) = 0;
+
+	/* check that:
+	 *   one byte for the Tag 11 ID flag
+	 *   two bytes for the Tag 11 length
+	 * do not exceed the maximum_packet_size
+	 */
+	if (unlikely((*packet_size) + 3 > max_packet_size)) {
+		ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* check for Tag 11 identifyer - one byte */
+	if (data[(*packet_size)++] != ECRYPTFS_TAG_11_PACKET_TYPE) {
+		ecryptfs_printk(KERN_WARNING,
+				"Invalid tag 11 packet format\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* get Tag 11 content length - one or two bytes */
+	rc = parse_packet_length(&data[(*packet_size)], &body_size,
+				 &length_size);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING,
+				"Invalid tag 11 packet format\n");
+		goto out;
+	}
+	(*packet_size) += length_size;
+
+	if (body_size < 13) {
+		ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n",
+				body_size);
+		rc = -EINVAL;
+		goto out;
+	}
+	/* We have 13 bytes of surrounding packet values */
+	(*tag_11_contents_size) = (body_size - 13);
+
+	/* now we know the length of the remainting Tag 11 packet size:
+	 *   14 fix bytes for: special flag one, special flag two,
+	 *   		       12 skipped bytes
+	 *   body_size bytes minus the stuff above is the Tag 11 content
+	 */
+	/* FIXME why is the body size one byte smaller than the actual
+	 * size of the body?
+	 * this seems to be an error here as well as in
+	 * write_tag_11_packet() */
+	if (unlikely((*packet_size) + body_size + 1 > max_packet_size)) {
+		ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* special flag one - one byte */
+	if (data[(*packet_size)++] != 0x62) {
+		ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* special flag two - one byte */
+	if (data[(*packet_size)++] != 0x08) {
+		ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n");
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* skip the next 12 bytes */
+	(*packet_size) += 12; /* We don't care about the filename or
+			       * the timestamp */
+
+	/* get the Tag 11 contents - tag_11_contents_size bytes */
+	memcpy(contents, &data[(*packet_size)], (*tag_11_contents_size));
+	(*packet_size) += (*tag_11_contents_size);
+
+out:
+	if (rc) {
+		(*packet_size) = 0;
+		(*tag_11_contents_size) = 0;
+	}
+	return rc;
+}
+
+/**
+ * decrypt_session_key - Decrypt the session key with the given auth_tok.
+ *
+ * Returns Zero on success; non-zero error otherwise.
+ */
+static int decrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
+			       struct ecryptfs_crypt_stat *crypt_stat)
+{
+	int rc = 0;
+	struct ecryptfs_password *password_s_ptr;
+	struct crypto_tfm *tfm = NULL;
+	struct scatterlist src_sg[2], dst_sg[2];
+	struct mutex *tfm_mutex = NULL;
+	/* TODO: Use virt_to_scatterlist for these */
+	char *encrypted_session_key;
+	char *session_key;
+
+	password_s_ptr = &auth_tok->token.password;
+	if (ECRYPTFS_CHECK_FLAG(password_s_ptr->flags,
+				ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET))
+		ecryptfs_printk(KERN_DEBUG, "Session key encryption key "
+				"set; skipping key generation\n");
+	ecryptfs_printk(KERN_DEBUG, "Session key encryption key (size [%d])"
+			":\n",
+			password_s_ptr->session_key_encryption_key_bytes);
+	if (ecryptfs_verbosity > 0)
+		ecryptfs_dump_hex(password_s_ptr->session_key_encryption_key,
+				  password_s_ptr->
+				  session_key_encryption_key_bytes);
+	if (!strcmp(crypt_stat->cipher,
+		    crypt_stat->mount_crypt_stat->global_default_cipher_name)
+	    && crypt_stat->mount_crypt_stat->global_key_tfm) {
+		tfm = crypt_stat->mount_crypt_stat->global_key_tfm;
+		tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex;
+	} else {
+		tfm = crypto_alloc_tfm(crypt_stat->cipher,
+				       CRYPTO_TFM_REQ_WEAK_KEY);
+		if (!tfm) {
+			printk(KERN_ERR "Error allocating crypto context\n");
+			rc = -ENOMEM;
+			goto out;
+		}
+	}
+	if (password_s_ptr->session_key_encryption_key_bytes
+	    < crypto_tfm_alg_min_keysize(tfm)) {
+		printk(KERN_WARNING "Session key encryption key is [%d] bytes; "
+		       "minimum keysize for selected cipher is [%d] bytes.\n",
+		       password_s_ptr->session_key_encryption_key_bytes,
+		       crypto_tfm_alg_min_keysize(tfm));
+		rc = -EINVAL;
+		goto out;
+	}
+	if (tfm_mutex)
+		mutex_lock(tfm_mutex);
+	crypto_cipher_setkey(tfm, password_s_ptr->session_key_encryption_key,
+			     crypt_stat->key_size);
+	/* TODO: virt_to_scatterlist */
+	encrypted_session_key = (char *)__get_free_page(GFP_KERNEL);
+	if (!encrypted_session_key) {
+		ecryptfs_printk(KERN_ERR, "Out of memory\n");
+		rc = -ENOMEM;
+		goto out_free_tfm;
+	}
+	session_key = (char *)__get_free_page(GFP_KERNEL);
+	if (!session_key) {
+		kfree(encrypted_session_key);
+		ecryptfs_printk(KERN_ERR, "Out of memory\n");
+		rc = -ENOMEM;
+		goto out_free_tfm;
+	}
+	memcpy(encrypted_session_key, auth_tok->session_key.encrypted_key,
+	       auth_tok->session_key.encrypted_key_size);
+	src_sg[0].page = virt_to_page(encrypted_session_key);
+	src_sg[0].offset = 0;
+	BUG_ON(auth_tok->session_key.encrypted_key_size > PAGE_CACHE_SIZE);
+	src_sg[0].length = auth_tok->session_key.encrypted_key_size;
+	dst_sg[0].page = virt_to_page(session_key);
+	dst_sg[0].offset = 0;
+	auth_tok->session_key.decrypted_key_size =
+	    auth_tok->session_key.encrypted_key_size;
+	dst_sg[0].length = auth_tok->session_key.encrypted_key_size;
+	/* TODO: Handle error condition */
+	crypto_cipher_decrypt(tfm, dst_sg, src_sg,
+			      auth_tok->session_key.encrypted_key_size);
+	auth_tok->session_key.decrypted_key_size =
+	    auth_tok->session_key.encrypted_key_size;
+	memcpy(auth_tok->session_key.decrypted_key, session_key,
+	       auth_tok->session_key.decrypted_key_size);
+	auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY;
+	memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key,
+	       auth_tok->session_key.decrypted_key_size);
+	ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID);
+	ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n");
+	if (ecryptfs_verbosity > 0)
+		ecryptfs_dump_hex(crypt_stat->key,
+				  crypt_stat->key_size);
+	memset(encrypted_session_key, 0, PAGE_CACHE_SIZE);
+	free_page((unsigned long)encrypted_session_key);
+	memset(session_key, 0, PAGE_CACHE_SIZE);
+	free_page((unsigned long)session_key);
+out_free_tfm:
+	if (tfm_mutex)
+		mutex_unlock(tfm_mutex);
+	else
+		crypto_free_tfm(tfm);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_parse_packet_set
+ * @dest: The header page in memory
+ * @version: Version of file format, to guide parsing behavior
+ *
+ * Get crypt_stat to have the file's session key if the requisite key
+ * is available to decrypt the session key.
+ *
+ * Returns Zero if a valid authentication token was retrieved and
+ * processed; negative value for file not encrypted or for error
+ * conditions.
+ */
+int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
+			      unsigned char *src,
+			      struct dentry *ecryptfs_dentry)
+{
+	size_t i = 0;
+	int rc = 0;
+	size_t found_auth_tok = 0;
+	size_t next_packet_is_auth_tok_packet;
+	char sig[ECRYPTFS_SIG_SIZE_HEX];
+	struct list_head auth_tok_list;
+	struct list_head *walker;
+	struct ecryptfs_auth_tok *chosen_auth_tok = NULL;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&ecryptfs_superblock_to_private(
+			ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	struct ecryptfs_auth_tok *candidate_auth_tok = NULL;
+	size_t packet_size;
+	struct ecryptfs_auth_tok *new_auth_tok;
+	unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE];
+	size_t tag_11_contents_size;
+	size_t tag_11_packet_size;
+
+	INIT_LIST_HEAD(&auth_tok_list);
+	/* Parse the header to find as many packets as we can, these will be
+	 * added the our &auth_tok_list */
+	next_packet_is_auth_tok_packet = 1;
+	while (next_packet_is_auth_tok_packet) {
+		size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i);
+
+		switch (src[i]) {
+		case ECRYPTFS_TAG_3_PACKET_TYPE:
+			rc = parse_tag_3_packet(crypt_stat,
+						(unsigned char *)&src[i],
+						&auth_tok_list, &new_auth_tok,
+						&packet_size, max_packet_size);
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "Error parsing "
+						"tag 3 packet\n");
+				rc = -EIO;
+				goto out_wipe_list;
+			}
+			i += packet_size;
+			rc = parse_tag_11_packet((unsigned char *)&src[i],
+						 sig_tmp_space,
+						 ECRYPTFS_SIG_SIZE,
+						 &tag_11_contents_size,
+						 &tag_11_packet_size,
+						 max_packet_size);
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "No valid "
+						"(ecryptfs-specific) literal "
+						"packet containing "
+						"authentication token "
+						"signature found after "
+						"tag 3 packet\n");
+				rc = -EIO;
+				goto out_wipe_list;
+			}
+			i += tag_11_packet_size;
+			if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) {
+				ecryptfs_printk(KERN_ERR, "Expected "
+						"signature of size [%d]; "
+						"read size [%d]\n",
+						ECRYPTFS_SIG_SIZE,
+						tag_11_contents_size);
+				rc = -EIO;
+				goto out_wipe_list;
+			}
+			ecryptfs_to_hex(new_auth_tok->token.password.signature,
+					sig_tmp_space, tag_11_contents_size);
+			new_auth_tok->token.password.signature[
+				ECRYPTFS_PASSWORD_SIG_SIZE] = '\0';
+			ECRYPTFS_SET_FLAG(crypt_stat->flags,
+					  ECRYPTFS_ENCRYPTED);
+			break;
+		case ECRYPTFS_TAG_11_PACKET_TYPE:
+			ecryptfs_printk(KERN_WARNING, "Invalid packet set "
+					"(Tag 11 not allowed by itself)\n");
+			rc = -EIO;
+			goto out_wipe_list;
+			break;
+		default:
+			ecryptfs_printk(KERN_DEBUG, "No packet at offset "
+					"[%d] of the file header; hex value of "
+					"character is [0x%.2x]\n", i, src[i]);
+			next_packet_is_auth_tok_packet = 0;
+		}
+	}
+	if (list_empty(&auth_tok_list)) {
+		rc = -EINVAL; /* Do not support non-encrypted files in
+			       * the 0.1 release */
+		goto out;
+	}
+	/* If we have a global auth tok, then we should try to use
+	 * it */
+	if (mount_crypt_stat->global_auth_tok) {
+		memcpy(sig, mount_crypt_stat->global_auth_tok_sig,
+		       ECRYPTFS_SIG_SIZE_HEX);
+		chosen_auth_tok = mount_crypt_stat->global_auth_tok;
+	} else
+		BUG(); /* We should always have a global auth tok in
+			* the 0.1 release */
+	/* Scan list to see if our chosen_auth_tok works */
+	list_for_each(walker, &auth_tok_list) {
+		struct ecryptfs_auth_tok_list_item *auth_tok_list_item;
+		auth_tok_list_item =
+		    list_entry(walker, struct ecryptfs_auth_tok_list_item,
+			       list);
+		candidate_auth_tok = &auth_tok_list_item->auth_tok;
+		if (unlikely(ecryptfs_verbosity > 0)) {
+			ecryptfs_printk(KERN_DEBUG,
+					"Considering cadidate auth tok:\n");
+			ecryptfs_dump_auth_tok(candidate_auth_tok);
+		}
+		/* TODO: Replace ECRYPTFS_SIG_SIZE_HEX w/ dynamic value */
+		if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD
+		    && !strncmp(candidate_auth_tok->token.password.signature,
+				sig, ECRYPTFS_SIG_SIZE_HEX)) {
+			found_auth_tok = 1;
+			goto leave_list;
+			/* TODO: Transfer the common salt into the
+			 * crypt_stat salt */
+		}
+	}
+leave_list:
+	if (!found_auth_tok) {
+		ecryptfs_printk(KERN_ERR, "Could not find authentication "
+				"token on temporary list for sig [%.*s]\n",
+				ECRYPTFS_SIG_SIZE_HEX, sig);
+		rc = -EIO;
+		goto out_wipe_list;
+	} else {
+		memcpy(&(candidate_auth_tok->token.password),
+		       &(chosen_auth_tok->token.password),
+		       sizeof(struct ecryptfs_password));
+		rc = decrypt_session_key(candidate_auth_tok, crypt_stat);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error decrypting the "
+					"session key\n");
+			goto out_wipe_list;
+		}
+		rc = ecryptfs_compute_root_iv(crypt_stat);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error computing "
+					"the root IV\n");
+			goto out_wipe_list;
+		}
+	}
+	rc = ecryptfs_init_crypt_ctx(crypt_stat);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error initializing crypto "
+				"context for cipher [%s]; rc = [%d]\n",
+				crypt_stat->cipher, rc);
+	}
+out_wipe_list:
+	wipe_auth_tok_list(&auth_tok_list);
+out:
+	return rc;
+}
+
+/**
+ * write_tag_11_packet
+ * @dest: Target into which Tag 11 packet is to be written
+ * @max: Maximum packet length
+ * @contents: Byte array of contents to copy in
+ * @contents_length: Number of bytes in contents
+ * @packet_length: Length of the Tag 11 packet written; zero on error
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+write_tag_11_packet(char *dest, int max, char *contents, size_t contents_length,
+		    size_t *packet_length)
+{
+	int rc = 0;
+	size_t packet_size_length;
+
+	(*packet_length) = 0;
+	if ((13 + contents_length) > max) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_ERR, "Packet length larger than "
+				"maximum allowable\n");
+		goto out;
+	}
+	/* General packet header */
+	/* Packet tag */
+	dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE;
+	/* Packet length */
+	rc = write_packet_length(&dest[(*packet_length)],
+				 (13 + contents_length), &packet_size_length);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating tag 11 packet "
+				"header; cannot generate packet length\n");
+		goto out;
+	}
+	(*packet_length) += packet_size_length;
+	/* Tag 11 specific */
+	/* One-octet field that describes how the data is formatted */
+	dest[(*packet_length)++] = 0x62; /* binary data */
+	/* One-octet filename length followed by filename */
+	dest[(*packet_length)++] = 8;
+	memcpy(&dest[(*packet_length)], "_CONSOLE", 8);
+	(*packet_length) += 8;
+	/* Four-octet number indicating modification date */
+	memset(&dest[(*packet_length)], 0x00, 4);
+	(*packet_length) += 4;
+	/* Remainder is literal data */
+	memcpy(&dest[(*packet_length)], contents, contents_length);
+	(*packet_length) += contents_length;
+ out:
+	if (rc)
+		(*packet_length) = 0;
+	return rc;
+}
+
+/**
+ * write_tag_3_packet
+ * @dest: Buffer into which to write the packet
+ * @max: Maximum number of bytes that can be written
+ * @auth_tok: Authentication token
+ * @crypt_stat: The cryptographic context
+ * @key_rec: encrypted key
+ * @packet_size: This function will write the number of bytes that end
+ *               up constituting the packet; set to zero on error
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int
+write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok,
+		   struct ecryptfs_crypt_stat *crypt_stat,
+		   struct ecryptfs_key_record *key_rec, size_t *packet_size)
+{
+	int rc = 0;
+
+	size_t i;
+	size_t signature_is_valid = 0;
+	size_t encrypted_session_key_valid = 0;
+	char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES];
+	struct scatterlist dest_sg[2];
+	struct scatterlist src_sg[2];
+	struct crypto_tfm *tfm = NULL;
+	struct mutex *tfm_mutex = NULL;
+	size_t key_rec_size;
+	size_t packet_size_length;
+	size_t cipher_code;
+
+	(*packet_size) = 0;
+	/* Check for a valid signature on the auth_tok */
+	for (i = 0; i < ECRYPTFS_SIG_SIZE_HEX; i++)
+		signature_is_valid |= auth_tok->token.password.signature[i];
+	if (!signature_is_valid)
+		BUG();
+	ecryptfs_from_hex((*key_rec).sig, auth_tok->token.password.signature,
+			  ECRYPTFS_SIG_SIZE);
+	encrypted_session_key_valid = 0;
+	for (i = 0; i < crypt_stat->key_size; i++)
+		encrypted_session_key_valid |=
+			auth_tok->session_key.encrypted_key[i];
+	if (encrypted_session_key_valid) {
+		memcpy((*key_rec).enc_key,
+		       auth_tok->session_key.encrypted_key,
+		       auth_tok->session_key.encrypted_key_size);
+		goto encrypted_session_key_set;
+	}
+	if (auth_tok->session_key.encrypted_key_size == 0)
+		auth_tok->session_key.encrypted_key_size =
+			crypt_stat->key_size;
+	if (crypt_stat->key_size == 24
+	    && strcmp("aes", crypt_stat->cipher) == 0) {
+		memset((crypt_stat->key + 24), 0, 8);
+		auth_tok->session_key.encrypted_key_size = 32;
+	}
+	(*key_rec).enc_key_size =
+		auth_tok->session_key.encrypted_key_size;
+	if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags,
+				ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET)) {
+		ecryptfs_printk(KERN_DEBUG, "Using previously generated "
+				"session key encryption key of size [%d]\n",
+				auth_tok->token.password.
+				session_key_encryption_key_bytes);
+		memcpy(session_key_encryption_key,
+		       auth_tok->token.password.session_key_encryption_key,
+		       crypt_stat->key_size);
+		ecryptfs_printk(KERN_DEBUG,
+				"Cached session key " "encryption key: \n");
+		if (ecryptfs_verbosity > 0)
+			ecryptfs_dump_hex(session_key_encryption_key, 16);
+	}
+	if (unlikely(ecryptfs_verbosity > 0)) {
+		ecryptfs_printk(KERN_DEBUG, "Session key encryption key:\n");
+		ecryptfs_dump_hex(session_key_encryption_key, 16);
+	}
+	rc = virt_to_scatterlist(crypt_stat->key,
+				 (*key_rec).enc_key_size, src_sg, 2);
+	if (!rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
+				"for crypt_stat session key\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	rc = virt_to_scatterlist((*key_rec).enc_key,
+				 (*key_rec).enc_key_size, dest_sg, 2);
+	if (!rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating scatterlist "
+				"for crypt_stat encrypted session key\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	if (!strcmp(crypt_stat->cipher,
+		    crypt_stat->mount_crypt_stat->global_default_cipher_name)
+	    && crypt_stat->mount_crypt_stat->global_key_tfm) {
+		tfm = crypt_stat->mount_crypt_stat->global_key_tfm;
+		tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex;
+	} else
+		tfm = crypto_alloc_tfm(crypt_stat->cipher, 0);
+	if (!tfm) {
+		ecryptfs_printk(KERN_ERR, "Could not initialize crypto "
+				"context for cipher [%s]\n",
+				crypt_stat->cipher);
+		rc = -EINVAL;
+		goto out;
+	}
+	if (tfm_mutex)
+		mutex_lock(tfm_mutex);
+	rc = crypto_cipher_setkey(tfm, session_key_encryption_key,
+				  crypt_stat->key_size);
+	if (rc < 0) {
+		if (tfm_mutex)
+			mutex_unlock(tfm_mutex);
+		ecryptfs_printk(KERN_ERR, "Error setting key for crypto "
+				"context\n");
+		goto out;
+	}
+	rc = 0;
+	ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n",
+			crypt_stat->key_size);
+	crypto_cipher_encrypt(tfm, dest_sg, src_sg,
+			      (*key_rec).enc_key_size);
+	if (tfm_mutex)
+		mutex_unlock(tfm_mutex);
+	ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n");
+	if (ecryptfs_verbosity > 0)
+		ecryptfs_dump_hex((*key_rec).enc_key,
+				  (*key_rec).enc_key_size);
+encrypted_session_key_set:
+	/* Now we have a valid key_rec.  Append it to the
+	 * key_rec set. */
+	key_rec_size = (sizeof(struct ecryptfs_key_record)
+			- ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES
+			+ ((*key_rec).enc_key_size));
+	/* TODO: Include a packet size limit as a parameter to this
+	 * function once we have multi-packet headers (for versions
+	 * later than 0.1 */
+	if (key_rec_size >= ECRYPTFS_MAX_KEYSET_SIZE) {
+		ecryptfs_printk(KERN_ERR, "Keyset too large\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	/* TODO: Packet size limit */
+	/* We have 5 bytes of surrounding packet data */
+	if ((0x05 + ECRYPTFS_SALT_SIZE
+	     + (*key_rec).enc_key_size) >= max) {
+		ecryptfs_printk(KERN_ERR, "Authentication token is too "
+				"large\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	/* This format is inspired by OpenPGP; see RFC 2440
+	 * packet tag 3 */
+	dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE;
+	/* ver+cipher+s2k+hash+salt+iter+enc_key */
+	rc = write_packet_length(&dest[(*packet_size)],
+				 (0x05 + ECRYPTFS_SALT_SIZE
+				  + (*key_rec).enc_key_size),
+				 &packet_size_length);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error generating tag 3 packet "
+				"header; cannot generate packet length\n");
+		goto out;
+	}
+	(*packet_size) += packet_size_length;
+	dest[(*packet_size)++] = 0x04; /* version 4 */
+	cipher_code = ecryptfs_code_for_cipher_string(crypt_stat);
+	if (cipher_code == 0) {
+		ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
+				"cipher [%s]\n", crypt_stat->cipher);
+		rc = -EINVAL;
+		goto out;
+	}
+	dest[(*packet_size)++] = cipher_code;
+	dest[(*packet_size)++] = 0x03;	/* S2K */
+	dest[(*packet_size)++] = 0x01;	/* MD5 (TODO: parameterize) */
+	memcpy(&dest[(*packet_size)], auth_tok->token.password.salt,
+	       ECRYPTFS_SALT_SIZE);
+	(*packet_size) += ECRYPTFS_SALT_SIZE;	/* salt */
+	dest[(*packet_size)++] = 0x60;	/* hash iterations (65536) */
+	memcpy(&dest[(*packet_size)], (*key_rec).enc_key,
+	       (*key_rec).enc_key_size);
+	(*packet_size) += (*key_rec).enc_key_size;
+out:
+	if (tfm && !tfm_mutex)
+		crypto_free_tfm(tfm);
+	if (rc)
+		(*packet_size) = 0;
+	return rc;
+}
+
+/**
+ * ecryptfs_generate_key_packet_set
+ * @dest: Virtual address from which to write the key record set
+ * @crypt_stat: The cryptographic context from which the
+ *              authentication tokens will be retrieved
+ * @ecryptfs_dentry: The dentry, used to retrieve the mount crypt stat
+ *                   for the global parameters
+ * @len: The amount written
+ * @max: The maximum amount of data allowed to be written
+ *
+ * Generates a key packet set and writes it to the virtual address
+ * passed in.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+int
+ecryptfs_generate_key_packet_set(char *dest_base,
+				 struct ecryptfs_crypt_stat *crypt_stat,
+				 struct dentry *ecryptfs_dentry, size_t *len,
+				 size_t max)
+{
+	int rc = 0;
+	struct ecryptfs_auth_tok *auth_tok;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&ecryptfs_superblock_to_private(
+			ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	size_t written;
+	struct ecryptfs_key_record key_rec;
+
+	(*len) = 0;
+	if (mount_crypt_stat->global_auth_tok) {
+		auth_tok = mount_crypt_stat->global_auth_tok;
+		if (auth_tok->token_type == ECRYPTFS_PASSWORD) {
+			rc = write_tag_3_packet((dest_base + (*len)),
+						max, auth_tok,
+						crypt_stat, &key_rec,
+						&written);
+			if (rc) {
+				ecryptfs_printk(KERN_WARNING, "Error "
+						"writing tag 3 packet\n");
+				goto out;
+			}
+			(*len) += written;
+			/* Write auth tok signature packet */
+			rc = write_tag_11_packet(
+				(dest_base + (*len)),
+				(max - (*len)),
+				key_rec.sig, ECRYPTFS_SIG_SIZE, &written);
+			if (rc) {
+				ecryptfs_printk(KERN_ERR, "Error writing "
+						"auth tok signature packet\n");
+				goto out;
+			}
+			(*len) += written;
+		} else {
+			ecryptfs_printk(KERN_WARNING, "Unsupported "
+					"authentication token type\n");
+			rc = -EINVAL;
+			goto out;
+		}
+		if (rc) {
+			ecryptfs_printk(KERN_WARNING, "Error writing "
+					"authentication token packet with sig "
+					"= [%s]\n",
+					mount_crypt_stat->global_auth_tok_sig);
+			rc = -EIO;
+			goto out;
+		}
+	} else
+		BUG();
+	if (likely((max - (*len)) > 0)) {
+		dest_base[(*len)] = 0x00;
+	} else {
+		ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n");
+		rc = -EIO;
+	}
+out:
+	if (rc)
+		(*len) = 0;
+	return rc;
+}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
new file mode 100644
index 000000000000..5938a232d11b
--- /dev/null
+++ b/fs/ecryptfs/main.c
@@ -0,0 +1,828 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *              Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/dcache.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/skbuff.h>
+#include <linux/crypto.h>
+#include <linux/netlink.h>
+#include <linux/mount.h>
+#include <linux/dcache.h>
+#include <linux/pagemap.h>
+#include <linux/key.h>
+#include <linux/parser.h>
+#include "ecryptfs_kernel.h"
+
+/**
+ * Module parameter that defines the ecryptfs_verbosity level.
+ */
+int ecryptfs_verbosity = 0;
+
+module_param(ecryptfs_verbosity, int, 0);
+MODULE_PARM_DESC(ecryptfs_verbosity,
+		 "Initial verbosity level (0 or 1; defaults to "
+		 "0, which is Quiet)");
+
+void __ecryptfs_printk(const char *fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+	if (fmt[1] == '7') { /* KERN_DEBUG */
+		if (ecryptfs_verbosity >= 1)
+			vprintk(fmt, args);
+	} else
+		vprintk(fmt, args);
+	va_end(args);
+}
+
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ * @flag: If set to true, then d_add is called, else d_instantiate is called
+ *
+ * Interposes upper and lower dentries.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
+		       struct super_block *sb, int flag)
+{
+	struct inode *lower_inode;
+	struct inode *inode;
+	int rc = 0;
+
+	lower_inode = lower_dentry->d_inode;
+	if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
+		rc = -EXDEV;
+		goto out;
+	}
+	if (!igrab(lower_inode)) {
+		rc = -ESTALE;
+		goto out;
+	}
+	inode = iget5_locked(sb, (unsigned long)lower_inode,
+			     ecryptfs_inode_test, ecryptfs_inode_set,
+			     lower_inode);
+	if (!inode) {
+		rc = -EACCES;
+		iput(lower_inode);
+		goto out;
+	}
+	if (inode->i_state & I_NEW)
+		unlock_new_inode(inode);
+	else
+		iput(lower_inode);
+	if (S_ISLNK(lower_inode->i_mode))
+		inode->i_op = &ecryptfs_symlink_iops;
+	else if (S_ISDIR(lower_inode->i_mode))
+		inode->i_op = &ecryptfs_dir_iops;
+	if (S_ISDIR(lower_inode->i_mode))
+		inode->i_fop = &ecryptfs_dir_fops;
+	if (special_file(lower_inode->i_mode))
+		init_special_inode(inode, lower_inode->i_mode,
+				   lower_inode->i_rdev);
+	dentry->d_op = &ecryptfs_dops;
+	if (flag)
+		d_add(dentry, inode);
+	else
+		d_instantiate(dentry, inode);
+	ecryptfs_copy_attr_all(inode, lower_inode);
+	/* This size will be overwritten for real files w/ headers and
+	 * other metadata */
+	ecryptfs_copy_inode_size(inode, lower_inode);
+out:
+	return rc;
+}
+
+enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_debug,
+       ecryptfs_opt_ecryptfs_debug, ecryptfs_opt_cipher,
+       ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes,
+       ecryptfs_opt_passthrough, ecryptfs_opt_err };
+
+static match_table_t tokens = {
+	{ecryptfs_opt_sig, "sig=%s"},
+	{ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
+	{ecryptfs_opt_debug, "debug=%u"},
+	{ecryptfs_opt_ecryptfs_debug, "ecryptfs_debug=%u"},
+	{ecryptfs_opt_cipher, "cipher=%s"},
+	{ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"},
+	{ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"},
+	{ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
+	{ecryptfs_opt_err, NULL}
+};
+
+/**
+ * ecryptfs_verify_version
+ * @version: The version number to confirm
+ *
+ * Returns zero on good version; non-zero otherwise
+ */
+static int ecryptfs_verify_version(u16 version)
+{
+	int rc = 0;
+	unsigned char major;
+	unsigned char minor;
+
+	major = ((version >> 8) & 0xFF);
+	minor = (version & 0xFF);
+	if (major != ECRYPTFS_VERSION_MAJOR) {
+		ecryptfs_printk(KERN_ERR, "Major version number mismatch. "
+				"Expected [%d]; got [%d]\n",
+				ECRYPTFS_VERSION_MAJOR, major);
+		rc = -EINVAL;
+		goto out;
+	}
+	if (minor != ECRYPTFS_VERSION_MINOR) {
+		ecryptfs_printk(KERN_ERR, "Minor version number mismatch. "
+				"Expected [%d]; got [%d]\n",
+				ECRYPTFS_VERSION_MINOR, minor);
+		rc = -EINVAL;
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_parse_options
+ * @sb: The ecryptfs super block
+ * @options: The options pased to the kernel
+ *
+ * Parse mount options:
+ * debug=N 	   - ecryptfs_verbosity level for debug output
+ * sig=XXX	   - description(signature) of the key to use
+ *
+ * Returns the dentry object of the lower-level (lower/interposed)
+ * directory; We want to mount our stackable file system on top of
+ * that lower directory.
+ *
+ * The signature of the key to use must be the description of a key
+ * already in the keyring. Mounting will fail if the key can not be
+ * found.
+ *
+ * Returns zero on success; non-zero on error
+ */
+static int ecryptfs_parse_options(struct super_block *sb, char *options)
+{
+	char *p;
+	int rc = 0;
+	int sig_set = 0;
+	int cipher_name_set = 0;
+	int cipher_key_bytes;
+	int cipher_key_bytes_set = 0;
+	struct key *auth_tok_key = NULL;
+	struct ecryptfs_auth_tok *auth_tok = NULL;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
+	substring_t args[MAX_OPT_ARGS];
+	int token;
+	char *sig_src;
+	char *sig_dst;
+	char *debug_src;
+	char *cipher_name_dst;
+	char *cipher_name_src;
+	char *cipher_key_bytes_src;
+	struct crypto_tfm *tmp_tfm;
+	int cipher_name_len;
+
+	if (!options) {
+		rc = -EINVAL;
+		goto out;
+	}
+	while ((p = strsep(&options, ",")) != NULL) {
+		if (!*p)
+			continue;
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case ecryptfs_opt_sig:
+		case ecryptfs_opt_ecryptfs_sig:
+			sig_src = args[0].from;
+			sig_dst =
+				mount_crypt_stat->global_auth_tok_sig;
+			memcpy(sig_dst, sig_src, ECRYPTFS_SIG_SIZE_HEX);
+			sig_dst[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+			ecryptfs_printk(KERN_DEBUG,
+					"The mount_crypt_stat "
+					"global_auth_tok_sig set to: "
+					"[%s]\n", sig_dst);
+			sig_set = 1;
+			break;
+		case ecryptfs_opt_debug:
+		case ecryptfs_opt_ecryptfs_debug:
+			debug_src = args[0].from;
+			ecryptfs_verbosity =
+				(int)simple_strtol(debug_src, &debug_src,
+						   0);
+			ecryptfs_printk(KERN_DEBUG,
+					"Verbosity set to [%d]" "\n",
+					ecryptfs_verbosity);
+			break;
+		case ecryptfs_opt_cipher:
+		case ecryptfs_opt_ecryptfs_cipher:
+			cipher_name_src = args[0].from;
+			cipher_name_dst =
+				mount_crypt_stat->
+				global_default_cipher_name;
+			strncpy(cipher_name_dst, cipher_name_src,
+				ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+			ecryptfs_printk(KERN_DEBUG,
+					"The mount_crypt_stat "
+					"global_default_cipher_name set to: "
+					"[%s]\n", cipher_name_dst);
+			cipher_name_set = 1;
+			break;
+		case ecryptfs_opt_ecryptfs_key_bytes:
+			cipher_key_bytes_src = args[0].from;
+			cipher_key_bytes =
+				(int)simple_strtol(cipher_key_bytes_src,
+						   &cipher_key_bytes_src, 0);
+			mount_crypt_stat->global_default_cipher_key_size =
+				cipher_key_bytes;
+			ecryptfs_printk(KERN_DEBUG,
+					"The mount_crypt_stat "
+					"global_default_cipher_key_size "
+					"set to: [%d]\n", mount_crypt_stat->
+					global_default_cipher_key_size);
+			cipher_key_bytes_set = 1;
+			break;
+		case ecryptfs_opt_passthrough:
+			mount_crypt_stat->flags |=
+				ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED;
+			break;
+		case ecryptfs_opt_err:
+		default:
+			ecryptfs_printk(KERN_WARNING,
+					"eCryptfs: unrecognized option '%s'\n",
+					p);
+		}
+	}
+	/* Do not support lack of mount-wide signature in 0.1
+	 * release */
+	if (!sig_set) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_ERR, "You must supply a valid "
+				"passphrase auth tok signature as a mount "
+				"parameter; see the eCryptfs README\n");
+		goto out;
+	}
+	if (!cipher_name_set) {
+		cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
+		if (unlikely(cipher_name_len
+			     >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) {
+			rc = -EINVAL;
+			BUG();
+			goto out;
+		}
+		memcpy(mount_crypt_stat->global_default_cipher_name,
+		       ECRYPTFS_DEFAULT_CIPHER, cipher_name_len);
+		mount_crypt_stat->global_default_cipher_name[cipher_name_len]
+		    = '\0';
+	}
+	if (!cipher_key_bytes_set) {
+		mount_crypt_stat->global_default_cipher_key_size =
+			ECRYPTFS_DEFAULT_KEY_BYTES;
+		ecryptfs_printk(KERN_DEBUG, "Cipher key size was not "
+				"specified.  Defaulting to [%d]\n",
+				mount_crypt_stat->
+				global_default_cipher_key_size);
+	}
+	rc = ecryptfs_process_cipher(
+		&tmp_tfm,
+		&mount_crypt_stat->global_key_tfm,
+		mount_crypt_stat->global_default_cipher_name,
+		mount_crypt_stat->global_default_cipher_key_size);
+	if (tmp_tfm)
+		crypto_free_tfm(tmp_tfm);
+	if (rc) {
+		printk(KERN_ERR "Error attempting to initialize cipher [%s] "
+		       "with key size [%Zd] bytes; rc = [%d]\n",
+		       mount_crypt_stat->global_default_cipher_name,
+		       mount_crypt_stat->global_default_cipher_key_size, rc);
+		rc = -EINVAL;
+		goto out;
+	}
+	mutex_init(&mount_crypt_stat->global_key_tfm_mutex);
+	ecryptfs_printk(KERN_DEBUG, "Requesting the key with description: "
+			"[%s]\n", mount_crypt_stat->global_auth_tok_sig);
+	/* The reference to this key is held until umount is done The
+	 * call to key_put is done in ecryptfs_put_super() */
+	auth_tok_key = request_key(&key_type_user,
+				   mount_crypt_stat->global_auth_tok_sig,
+				   NULL);
+	if (!auth_tok_key || IS_ERR(auth_tok_key)) {
+		ecryptfs_printk(KERN_ERR, "Could not find key with "
+				"description: [%s]\n",
+				mount_crypt_stat->global_auth_tok_sig);
+		process_request_key_err(PTR_ERR(auth_tok_key));
+		rc = -EINVAL;
+		goto out;
+	}
+	auth_tok = ecryptfs_get_key_payload_data(auth_tok_key);
+	if (ecryptfs_verify_version(auth_tok->version)) {
+		ecryptfs_printk(KERN_ERR, "Data structure version mismatch. "
+				"Userspace tools must match eCryptfs kernel "
+				"module with major version [%d] and minor "
+				"version [%d]\n", ECRYPTFS_VERSION_MAJOR,
+				ECRYPTFS_VERSION_MINOR);
+		rc = -EINVAL;
+		goto out;
+	}
+	if (auth_tok->token_type != ECRYPTFS_PASSWORD) {
+		ecryptfs_printk(KERN_ERR, "Invalid auth_tok structure "
+				"returned from key\n");
+		rc = -EINVAL;
+		goto out;
+	}
+	mount_crypt_stat->global_auth_tok_key = auth_tok_key;
+	mount_crypt_stat->global_auth_tok = auth_tok;
+out:
+	return rc;
+}
+
+struct kmem_cache *ecryptfs_sb_info_cache;
+
+/**
+ * ecryptfs_fill_super
+ * @sb: The ecryptfs super block
+ * @raw_data: The options passed to mount
+ * @silent: Not used but required by function prototype
+ *
+ * Sets up what we can of the sb, rest is done in ecryptfs_read_super
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int
+ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
+{
+	int rc = 0;
+
+	/* Released in ecryptfs_put_super() */
+	ecryptfs_set_superblock_private(sb,
+					kmem_cache_alloc(ecryptfs_sb_info_cache,
+							 SLAB_KERNEL));
+	if (!ecryptfs_superblock_to_private(sb)) {
+		ecryptfs_printk(KERN_WARNING, "Out of memory\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	memset(ecryptfs_superblock_to_private(sb), 0,
+	       sizeof(struct ecryptfs_sb_info));
+	sb->s_op = &ecryptfs_sops;
+	/* Released through deactivate_super(sb) from get_sb_nodev */
+	sb->s_root = d_alloc(NULL, &(const struct qstr) {
+			     .hash = 0,.name = "/",.len = 1});
+	if (!sb->s_root) {
+		ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	sb->s_root->d_op = &ecryptfs_dops;
+	sb->s_root->d_sb = sb;
+	sb->s_root->d_parent = sb->s_root;
+	/* Released in d_release when dput(sb->s_root) is called */
+	/* through deactivate_super(sb) from get_sb_nodev() */
+	ecryptfs_set_dentry_private(sb->s_root,
+				    kmem_cache_alloc(ecryptfs_dentry_info_cache,
+						     SLAB_KERNEL));
+	if (!ecryptfs_dentry_to_private(sb->s_root)) {
+		ecryptfs_printk(KERN_ERR,
+				"dentry_info_cache alloc failed\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+	memset(ecryptfs_dentry_to_private(sb->s_root), 0,
+	       sizeof(struct ecryptfs_dentry_info));
+	rc = 0;
+out:
+	/* Should be able to rely on deactivate_super called from
+	 * get_sb_nodev */
+	return rc;
+}
+
+/**
+ * ecryptfs_read_super
+ * @sb: The ecryptfs super block
+ * @dev_name: The path to mount over
+ *
+ * Read the super block of the lower filesystem, and use
+ * ecryptfs_interpose to create our initial inode and super block
+ * struct.
+ */
+static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
+{
+	int rc;
+	struct nameidata nd;
+	struct dentry *lower_root;
+	struct vfsmount *lower_mnt;
+
+	memset(&nd, 0, sizeof(struct nameidata));
+	rc = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
+		goto out_free;
+	}
+	lower_root = nd.dentry;
+	if (!lower_root->d_inode) {
+		ecryptfs_printk(KERN_WARNING,
+				"No directory to interpose on\n");
+		rc = -ENOENT;
+		goto out_free;
+	}
+	lower_mnt = nd.mnt;
+	ecryptfs_set_superblock_lower(sb, lower_root->d_sb);
+	sb->s_maxbytes = lower_root->d_sb->s_maxbytes;
+	ecryptfs_set_dentry_lower(sb->s_root, lower_root);
+	ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt);
+	if ((rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0)))
+		goto out_free;
+	rc = 0;
+	goto out;
+out_free:
+	path_release(&nd);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_get_sb
+ * @fs_type
+ * @flags
+ * @dev_name: The path to mount over
+ * @raw_data: The options passed into the kernel
+ *
+ * The whole ecryptfs_get_sb process is broken into 4 functions:
+ * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
+ * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
+ *                        with as much information as it can before needing
+ *                        the lower filesystem.
+ * ecryptfs_read_super(): this accesses the lower filesystem and uses
+ *                        ecryptfs_interpolate to perform most of the linking
+ * ecryptfs_interpolate(): links the lower filesystem into ecryptfs
+ */
+static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
+			const char *dev_name, void *raw_data,
+			struct vfsmount *mnt)
+{
+	int rc;
+	struct super_block *sb;
+
+	rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt);
+	if (rc < 0) {
+		printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc);
+		goto out;
+	}
+	sb = mnt->mnt_sb;
+	rc = ecryptfs_parse_options(sb, raw_data);
+	if (rc) {
+		printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc);
+		goto out_abort;
+	}
+	rc = ecryptfs_read_super(sb, dev_name);
+	if (rc) {
+		printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc);
+		goto out_abort;
+	}
+	goto out;
+out_abort:
+	dput(sb->s_root);
+	up_write(&sb->s_umount);
+	deactivate_super(sb);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_kill_block_super
+ * @sb: The ecryptfs super block
+ *
+ * Used to bring the superblock down and free the private data.
+ * Private data is free'd in ecryptfs_put_super()
+ */
+static void ecryptfs_kill_block_super(struct super_block *sb)
+{
+	generic_shutdown_super(sb);
+}
+
+static struct file_system_type ecryptfs_fs_type = {
+	.owner = THIS_MODULE,
+	.name = "ecryptfs",
+	.get_sb = ecryptfs_get_sb,
+	.kill_sb = ecryptfs_kill_block_super,
+	.fs_flags = 0
+};
+
+/**
+ * inode_info_init_once
+ *
+ * Initializes the ecryptfs_inode_info_cache when it is created
+ */
+static void
+inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags)
+{
+	struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&ei->vfs_inode);
+}
+
+static struct ecryptfs_cache_info {
+	kmem_cache_t **cache;
+	const char *name;
+	size_t size;
+	void (*ctor)(void*, struct kmem_cache *, unsigned long);
+} ecryptfs_cache_infos[] = {
+	{
+		.cache = &ecryptfs_auth_tok_list_item_cache,
+		.name = "ecryptfs_auth_tok_list_item",
+		.size = sizeof(struct ecryptfs_auth_tok_list_item),
+	},
+	{
+		.cache = &ecryptfs_file_info_cache,
+		.name = "ecryptfs_file_cache",
+		.size = sizeof(struct ecryptfs_file_info),
+	},
+	{
+		.cache = &ecryptfs_dentry_info_cache,
+		.name = "ecryptfs_dentry_info_cache",
+		.size = sizeof(struct ecryptfs_dentry_info),
+	},
+	{
+		.cache = &ecryptfs_inode_info_cache,
+		.name = "ecryptfs_inode_cache",
+		.size = sizeof(struct ecryptfs_inode_info),
+		.ctor = inode_info_init_once,
+	},
+	{
+		.cache = &ecryptfs_sb_info_cache,
+		.name = "ecryptfs_sb_cache",
+		.size = sizeof(struct ecryptfs_sb_info),
+	},
+	{
+		.cache = &ecryptfs_header_cache_0,
+		.name = "ecryptfs_headers_0",
+		.size = PAGE_CACHE_SIZE,
+	},
+	{
+		.cache = &ecryptfs_header_cache_1,
+		.name = "ecryptfs_headers_1",
+		.size = PAGE_CACHE_SIZE,
+	},
+	{
+		.cache = &ecryptfs_header_cache_2,
+		.name = "ecryptfs_headers_2",
+		.size = PAGE_CACHE_SIZE,
+	},
+	{
+		.cache = &ecryptfs_lower_page_cache,
+		.name = "ecryptfs_lower_page_cache",
+		.size = PAGE_CACHE_SIZE,
+	},
+};
+
+static void ecryptfs_free_kmem_caches(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
+		struct ecryptfs_cache_info *info;
+
+		info = &ecryptfs_cache_infos[i];
+		if (*(info->cache))
+			kmem_cache_destroy(*(info->cache));
+	}
+}
+
+/**
+ * ecryptfs_init_kmem_caches
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_init_kmem_caches(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
+		struct ecryptfs_cache_info *info;
+
+		info = &ecryptfs_cache_infos[i];
+		*(info->cache) = kmem_cache_create(info->name, info->size,
+				0, SLAB_HWCACHE_ALIGN, info->ctor, NULL);
+		if (!*(info->cache)) {
+			ecryptfs_free_kmem_caches();
+			ecryptfs_printk(KERN_WARNING, "%s: "
+					"kmem_cache_create failed\n",
+					info->name);
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+struct ecryptfs_obj {
+	char *name;
+	struct list_head slot_list;
+	struct kobject kobj;
+};
+
+struct ecryptfs_attribute {
+	struct attribute attr;
+	ssize_t(*show) (struct ecryptfs_obj *, char *);
+	ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t);
+};
+
+static ssize_t
+ecryptfs_attr_store(struct kobject *kobj,
+		    struct attribute *attr, const char *buf, size_t len)
+{
+	struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
+						kobj);
+	struct ecryptfs_attribute *attribute =
+		container_of(attr, struct ecryptfs_attribute, attr);
+
+	return (attribute->store ? attribute->store(obj, buf, len) : 0);
+}
+
+static ssize_t
+ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+	struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj,
+						kobj);
+	struct ecryptfs_attribute *attribute =
+		container_of(attr, struct ecryptfs_attribute, attr);
+
+	return (attribute->show ? attribute->show(obj, buf) : 0);
+}
+
+static struct sysfs_ops ecryptfs_sysfs_ops = {
+	.show = ecryptfs_attr_show,
+	.store = ecryptfs_attr_store
+};
+
+static struct kobj_type ecryptfs_ktype = {
+	.sysfs_ops = &ecryptfs_sysfs_ops
+};
+
+static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL);
+
+static ssize_t version_show(struct ecryptfs_obj *obj, char *buff)
+{
+	return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
+}
+
+static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version);
+
+struct ecryptfs_version_str_map_elem {
+	u32 flag;
+	char *str;
+} ecryptfs_version_str_map[] = {
+	{ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"},
+	{ECRYPTFS_VERSIONING_PUBKEY, "pubkey"},
+	{ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"},
+	{ECRYPTFS_VERSIONING_POLICY, "policy"}
+};
+
+static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff)
+{
+	int i;
+	int remaining = PAGE_SIZE;
+	int total_written = 0;
+
+	buff[0] = '\0';
+	for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) {
+		int entry_size;
+
+		if (!(ECRYPTFS_VERSIONING_MASK
+		      & ecryptfs_version_str_map[i].flag))
+			continue;
+		entry_size = strlen(ecryptfs_version_str_map[i].str);
+		if ((entry_size + 2) > remaining)
+			goto out;
+		memcpy(buff, ecryptfs_version_str_map[i].str, entry_size);
+		buff[entry_size++] = '\n';
+		buff[entry_size] = '\0';
+		buff += entry_size;
+		total_written += entry_size;
+		remaining -= entry_size;
+	}
+out:
+	return total_written;
+}
+
+static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str);
+
+static int do_sysfs_registration(void)
+{
+	int rc;
+
+	if ((rc = subsystem_register(&ecryptfs_subsys))) {
+		printk(KERN_ERR
+		       "Unable to register ecryptfs sysfs subsystem\n");
+		goto out;
+	}
+	rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
+			       &sysfs_attr_version.attr);
+	if (rc) {
+		printk(KERN_ERR
+		       "Unable to create ecryptfs version attribute\n");
+		subsystem_unregister(&ecryptfs_subsys);
+		goto out;
+	}
+	rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj,
+			       &sysfs_attr_version_str.attr);
+	if (rc) {
+		printk(KERN_ERR
+		       "Unable to create ecryptfs version_str attribute\n");
+		sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+				  &sysfs_attr_version.attr);
+		subsystem_unregister(&ecryptfs_subsys);
+		goto out;
+	}
+out:
+	return rc;
+}
+
+static int __init ecryptfs_init(void)
+{
+	int rc;
+
+	if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) {
+		rc = -EINVAL;
+		ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
+				"larger than the host's page size, and so "
+				"eCryptfs cannot run on this system. The "
+				"default eCryptfs extent size is [%d] bytes; "
+				"the page size is [%d] bytes.\n",
+				ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE);
+		goto out;
+	}
+	rc = ecryptfs_init_kmem_caches();
+	if (rc) {
+		printk(KERN_ERR
+		       "Failed to allocate one or more kmem_cache objects\n");
+		goto out;
+	}
+	rc = register_filesystem(&ecryptfs_fs_type);
+	if (rc) {
+		printk(KERN_ERR "Failed to register filesystem\n");
+		ecryptfs_free_kmem_caches();
+		goto out;
+	}
+	kset_set_kset_s(&ecryptfs_subsys, fs_subsys);
+	sysfs_attr_version.attr.owner = THIS_MODULE;
+	sysfs_attr_version_str.attr.owner = THIS_MODULE;
+	rc = do_sysfs_registration();
+	if (rc) {
+		printk(KERN_ERR "sysfs registration failed\n");
+		unregister_filesystem(&ecryptfs_fs_type);
+		ecryptfs_free_kmem_caches();
+		goto out;
+	}
+out:
+	return rc;
+}
+
+static void __exit ecryptfs_exit(void)
+{
+	sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+			  &sysfs_attr_version.attr);
+	sysfs_remove_file(&ecryptfs_subsys.kset.kobj,
+			  &sysfs_attr_version_str.attr);
+	subsystem_unregister(&ecryptfs_subsys);
+	unregister_filesystem(&ecryptfs_fs_type);
+	ecryptfs_free_kmem_caches();
+}
+
+MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>");
+MODULE_DESCRIPTION("eCryptfs");
+
+MODULE_LICENSE("GPL");
+
+module_init(ecryptfs_init)
+module_exit(ecryptfs_exit)
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
new file mode 100644
index 000000000000..924dd90a4cf5
--- /dev/null
+++ b/fs/ecryptfs/mmap.c
@@ -0,0 +1,788 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ * This is where eCryptfs coordinates the symmetric encryption and
+ * decryption of the file data as it passes between the lower
+ * encrypted file and the upper decrypted file.
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/page-flags.h>
+#include <linux/mount.h>
+#include <linux/file.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include "ecryptfs_kernel.h"
+
+struct kmem_cache *ecryptfs_lower_page_cache;
+
+/**
+ * ecryptfs_get1page
+ *
+ * Get one page from cache or lower f/s, return error otherwise.
+ *
+ * Returns unlocked and up-to-date page (if ok), with increased
+ * refcnt.
+ */
+static struct page *ecryptfs_get1page(struct file *file, int index)
+{
+	struct page *page;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct address_space *mapping;
+
+	dentry = file->f_dentry;
+	inode = dentry->d_inode;
+	mapping = inode->i_mapping;
+	page = read_cache_page(mapping, index,
+			       (filler_t *)mapping->a_ops->readpage,
+			       (void *)file);
+	if (IS_ERR(page))
+		goto out;
+	wait_on_page_locked(page);
+out:
+	return page;
+}
+
+static
+int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros);
+
+/**
+ * ecryptfs_fill_zeros
+ * @file: The ecryptfs file
+ * @new_length: The new length of the data in the underlying file;
+ *              everything between the prior end of the file and the
+ *              new end of the file will be filled with zero's.
+ *              new_length must be greater than  current length
+ *
+ * Function for handling lseek-ing past the end of the file.
+ *
+ * This function does not support shrinking, only growing a file.
+ *
+ * Returns zero on success; non-zero otherwise.
+ */
+int ecryptfs_fill_zeros(struct file *file, loff_t new_length)
+{
+	int rc = 0;
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	pgoff_t old_end_page_index = 0;
+	pgoff_t index = old_end_page_index;
+	int old_end_pos_in_page = -1;
+	pgoff_t new_end_page_index;
+	int new_end_pos_in_page;
+	loff_t cur_length = i_size_read(inode);
+
+	if (cur_length != 0) {
+		index = old_end_page_index =
+		    ((cur_length - 1) >> PAGE_CACHE_SHIFT);
+		old_end_pos_in_page = ((cur_length - 1) & ~PAGE_CACHE_MASK);
+	}
+	new_end_page_index = ((new_length - 1) >> PAGE_CACHE_SHIFT);
+	new_end_pos_in_page = ((new_length - 1) & ~PAGE_CACHE_MASK);
+	ecryptfs_printk(KERN_DEBUG, "old_end_page_index = [0x%.16x]; "
+			"old_end_pos_in_page = [%d]; "
+			"new_end_page_index = [0x%.16x]; "
+			"new_end_pos_in_page = [%d]\n",
+			old_end_page_index, old_end_pos_in_page,
+			new_end_page_index, new_end_pos_in_page);
+	if (old_end_page_index == new_end_page_index) {
+		/* Start and end are in the same page; we just need to
+		 * set a portion of the existing page to zero's */
+		rc = write_zeros(file, index, (old_end_pos_in_page + 1),
+				 (new_end_pos_in_page - old_end_pos_in_page));
+		if (rc)
+			ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
+					"index=[0x%.16x], "
+					"old_end_pos_in_page=[d], "
+					"(PAGE_CACHE_SIZE - new_end_pos_in_page"
+					"=[%d]"
+					")=[d]) returned [%d]\n", file, index,
+					old_end_pos_in_page,
+					new_end_pos_in_page,
+					(PAGE_CACHE_SIZE - new_end_pos_in_page),
+					rc);
+		goto out;
+	}
+	/* Fill the remainder of the previous last page with zeros */
+	rc = write_zeros(file, index, (old_end_pos_in_page + 1),
+			 ((PAGE_CACHE_SIZE - 1) - old_end_pos_in_page));
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
+				"index=[0x%.16x], old_end_pos_in_page=[d], "
+				"(PAGE_CACHE_SIZE - old_end_pos_in_page)=[d]) "
+				"returned [%d]\n", file, index,
+				old_end_pos_in_page,
+				(PAGE_CACHE_SIZE - old_end_pos_in_page), rc);
+		goto out;
+	}
+	index++;
+	while (index < new_end_page_index) {
+		/* Fill all intermediate pages with zeros */
+		rc = write_zeros(file, index, 0, PAGE_CACHE_SIZE);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], "
+					"index=[0x%.16x], "
+					"old_end_pos_in_page=[d], "
+					"(PAGE_CACHE_SIZE - new_end_pos_in_page"
+					"=[%d]"
+					")=[d]) returned [%d]\n", file, index,
+					old_end_pos_in_page,
+					new_end_pos_in_page,
+					(PAGE_CACHE_SIZE - new_end_pos_in_page),
+					rc);
+			goto out;
+		}
+		index++;
+	}
+	/* Fill the portion at the beginning of the last new page with
+	 * zero's */
+	rc = write_zeros(file, index, 0, (new_end_pos_in_page + 1));
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "write_zeros(file="
+				"[%p], index=[0x%.16x], 0, "
+				"new_end_pos_in_page=[%d]"
+				"returned [%d]\n", file, index,
+				new_end_pos_in_page, rc);
+		goto out;
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_writepage
+ * @page: Page that is locked before this call is made
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct ecryptfs_page_crypt_context ctx;
+	int rc;
+
+	ctx.page = page;
+	ctx.mode = ECRYPTFS_WRITEPAGE_MODE;
+	ctx.param.wbc = wbc;
+	rc = ecryptfs_encrypt_page(&ctx);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error encrypting "
+				"page (upper index [0x%.16x])\n", page->index);
+		ClearPageUptodate(page);
+		goto out;
+	}
+	SetPageUptodate(page);
+	unlock_page(page);
+out:
+	return rc;
+}
+
+/**
+ * Reads the data from the lower file file at index lower_page_index
+ * and copies that data into page.
+ *
+ * @param page	Page to fill
+ * @param lower_page_index Index of the page in the lower file to get
+ */
+int ecryptfs_do_readpage(struct file *file, struct page *page,
+			 pgoff_t lower_page_index)
+{
+	int rc;
+	struct dentry *dentry;
+	struct file *lower_file;
+	struct dentry *lower_dentry;
+	struct inode *inode;
+	struct inode *lower_inode;
+	char *page_data;
+	struct page *lower_page = NULL;
+	char *lower_page_data;
+	const struct address_space_operations *lower_a_ops;
+
+	dentry = file->f_dentry;
+	lower_file = ecryptfs_file_to_lower(file);
+	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+	inode = dentry->d_inode;
+	lower_inode = ecryptfs_inode_to_lower(inode);
+	lower_a_ops = lower_inode->i_mapping->a_ops;
+	lower_page = read_cache_page(lower_inode->i_mapping, lower_page_index,
+				     (filler_t *)lower_a_ops->readpage,
+				     (void *)lower_file);
+	if (IS_ERR(lower_page)) {
+		rc = PTR_ERR(lower_page);
+		lower_page = NULL;
+		ecryptfs_printk(KERN_ERR, "Error reading from page cache\n");
+		goto out;
+	}
+	wait_on_page_locked(lower_page);
+	page_data = (char *)kmap(page);
+	if (!page_data) {
+		rc = -ENOMEM;
+		ecryptfs_printk(KERN_ERR, "Error mapping page\n");
+		goto out;
+	}
+	lower_page_data = (char *)kmap(lower_page);
+	if (!lower_page_data) {
+		rc = -ENOMEM;
+		ecryptfs_printk(KERN_ERR, "Error mapping page\n");
+		kunmap(page);
+		goto out;
+	}
+	memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE);
+	kunmap(lower_page);
+	kunmap(page);
+	rc = 0;
+out:
+	if (likely(lower_page))
+		page_cache_release(lower_page);
+	if (rc == 0)
+		SetPageUptodate(page);
+	else
+		ClearPageUptodate(page);
+	return rc;
+}
+
+/**
+ * ecryptfs_readpage
+ * @file: This is an ecryptfs file
+ * @page: ecryptfs associated page to stick the read data into
+ *
+ * Read in a page, decrypting if necessary.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+static int ecryptfs_readpage(struct file *file, struct page *page)
+{
+	int rc = 0;
+	struct ecryptfs_crypt_stat *crypt_stat;
+
+	BUG_ON(!(file && file->f_dentry && file->f_dentry->d_inode));
+	crypt_stat =
+		&ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat;
+	if (!crypt_stat
+	    || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)
+	    || ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) {
+		ecryptfs_printk(KERN_DEBUG,
+				"Passing through unencrypted page\n");
+		rc = ecryptfs_do_readpage(file, page, page->index);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error reading page; rc = "
+					"[%d]\n", rc);
+			goto out;
+		}
+	} else {
+		rc = ecryptfs_decrypt_page(file, page);
+		if (rc) {
+
+			ecryptfs_printk(KERN_ERR, "Error decrypting page; "
+					"rc = [%d]\n", rc);
+			goto out;
+		}
+	}
+	SetPageUptodate(page);
+out:
+	if (rc)
+		ClearPageUptodate(page);
+	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+			page->index);
+	unlock_page(page);
+	return rc;
+}
+
+static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
+{
+	struct inode *inode = page->mapping->host;
+	int end_byte_in_page;
+	int rc = 0;
+	char *page_virt;
+
+	if ((i_size_read(inode) / PAGE_CACHE_SIZE) == page->index) {
+		end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
+		if (to > end_byte_in_page)
+			end_byte_in_page = to;
+		page_virt = kmap(page);
+		if (!page_virt) {
+			rc = -ENOMEM;
+			ecryptfs_printk(KERN_WARNING,
+					"Could not map page\n");
+			goto out;
+		}
+		memset((page_virt + end_byte_in_page), 0,
+		       (PAGE_CACHE_SIZE - end_byte_in_page));
+		kunmap(page);
+	}
+out:
+	return rc;
+}
+
+static int ecryptfs_prepare_write(struct file *file, struct page *page,
+				  unsigned from, unsigned to)
+{
+	int rc = 0;
+
+	kmap(page);
+	if (from == 0 && to == PAGE_CACHE_SIZE)
+		goto out;	/* If we are writing a full page, it will be
+				   up to date. */
+	if (!PageUptodate(page))
+		rc = ecryptfs_do_readpage(file, page, page->index);
+out:
+	return rc;
+}
+
+int ecryptfs_grab_and_map_lower_page(struct page **lower_page,
+				     char **lower_virt,
+				     struct inode *lower_inode,
+				     unsigned long lower_page_index)
+{
+	int rc = 0;
+
+	(*lower_page) = grab_cache_page(lower_inode->i_mapping,
+					lower_page_index);
+	if (!(*lower_page)) {
+		ecryptfs_printk(KERN_ERR, "grab_cache_page for "
+				"lower_page_index = [0x%.16x] failed\n",
+				lower_page_index);
+		rc = -EINVAL;
+		goto out;
+	}
+	if (lower_virt)
+		(*lower_virt) = kmap((*lower_page));
+	else
+		kmap((*lower_page));
+out:
+	return rc;
+}
+
+int ecryptfs_writepage_and_release_lower_page(struct page *lower_page,
+					      struct inode *lower_inode,
+					      struct writeback_control *wbc)
+{
+	int rc = 0;
+
+	rc = lower_inode->i_mapping->a_ops->writepage(lower_page, wbc);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error calling lower writepage(); "
+				"rc = [%d]\n", rc);
+		goto out;
+	}
+	lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
+	page_cache_release(lower_page);
+out:
+	return rc;
+}
+
+static void ecryptfs_unmap_and_release_lower_page(struct page *lower_page)
+{
+	kunmap(lower_page);
+	ecryptfs_printk(KERN_DEBUG, "Unlocking lower page with index = "
+			"[0x%.16x]\n", lower_page->index);
+	unlock_page(lower_page);
+	page_cache_release(lower_page);
+}
+
+/**
+ * ecryptfs_write_inode_size_to_header
+ *
+ * Writes the lower file size to the first 8 bytes of the header.
+ *
+ * Returns zero on success; non-zero on error.
+ */
+int
+ecryptfs_write_inode_size_to_header(struct file *lower_file,
+				    struct inode *lower_inode,
+				    struct inode *inode)
+{
+	int rc = 0;
+	struct page *header_page;
+	char *header_virt;
+	const struct address_space_operations *lower_a_ops;
+	u64 file_size;
+
+	rc = ecryptfs_grab_and_map_lower_page(&header_page, &header_virt,
+					      lower_inode, 0);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "grab_cache_page for header page "
+				"failed\n");
+		goto out;
+	}
+	lower_a_ops = lower_inode->i_mapping->a_ops;
+	rc = lower_a_ops->prepare_write(lower_file, header_page, 0, 8);
+	file_size = (u64)i_size_read(inode);
+	ecryptfs_printk(KERN_DEBUG, "Writing size: [0x%.16x]\n", file_size);
+	file_size = cpu_to_be64(file_size);
+	memcpy(header_virt, &file_size, sizeof(u64));
+	rc = lower_a_ops->commit_write(lower_file, header_page, 0, 8);
+	if (rc < 0)
+		ecryptfs_printk(KERN_ERR, "Error commiting header page "
+				"write\n");
+	ecryptfs_unmap_and_release_lower_page(header_page);
+	lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty_sync(inode);
+out:
+	return rc;
+}
+
+int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode,
+			    struct file *lower_file,
+			    unsigned long lower_page_index, int byte_offset,
+			    int region_bytes)
+{
+	int rc = 0;
+
+	rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL, lower_inode,
+					      lower_page_index);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error attempting to grab and map "
+				"lower page with index [0x%.16x]\n",
+				lower_page_index);
+		goto out;
+	}
+	rc = lower_inode->i_mapping->a_ops->prepare_write(lower_file,
+							  (*lower_page),
+							  byte_offset,
+							  region_bytes);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "prepare_write for "
+				"lower_page_index = [0x%.16x] failed; rc = "
+				"[%d]\n", lower_page_index, rc);
+	}
+out:
+	if (rc && (*lower_page)) {
+		ecryptfs_unmap_and_release_lower_page(*lower_page);
+		(*lower_page) = NULL;
+	}
+	return rc;
+}
+
+/**
+ * ecryptfs_commit_lower_page
+ *
+ * Returns zero on success; non-zero on error
+ */
+int
+ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode,
+			   struct file *lower_file, int byte_offset,
+			   int region_size)
+{
+	int rc = 0;
+
+	rc = lower_inode->i_mapping->a_ops->commit_write(
+		lower_file, lower_page, byte_offset, region_size);
+	if (rc < 0) {
+		ecryptfs_printk(KERN_ERR,
+				"Error committing write; rc = [%d]\n", rc);
+	} else
+		rc = 0;
+	ecryptfs_unmap_and_release_lower_page(lower_page);
+	return rc;
+}
+
+/**
+ * ecryptfs_copy_page_to_lower
+ *
+ * Used for plaintext pass-through; no page index interpolation
+ * required.
+ */
+int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode,
+				struct file *lower_file)
+{
+	int rc = 0;
+	struct page *lower_page;
+
+	rc = ecryptfs_get_lower_page(&lower_page, lower_inode, lower_file,
+				     page->index, 0, PAGE_CACHE_SIZE);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error attempting to get page "
+				"at index [0x%.16x]\n", page->index);
+		goto out;
+	}
+	/* TODO: aops */
+	memcpy((char *)page_address(lower_page), page_address(page),
+	       PAGE_CACHE_SIZE);
+	rc = ecryptfs_commit_lower_page(lower_page, lower_inode, lower_file,
+					0, PAGE_CACHE_SIZE);
+	if (rc)
+		ecryptfs_printk(KERN_ERR, "Error attempting to commit page "
+				"at index [0x%.16x]\n", page->index);
+out:
+	return rc;
+}
+
+static int
+process_new_file(struct ecryptfs_crypt_stat *crypt_stat,
+		 struct file *file, struct inode *inode)
+{
+	struct page *header_page;
+	const struct address_space_operations *lower_a_ops;
+	struct inode *lower_inode;
+	struct file *lower_file;
+	char *header_virt;
+	int rc = 0;
+	int current_header_page = 0;
+	int header_pages;
+	int more_header_data_to_be_written = 1;
+
+	lower_inode = ecryptfs_inode_to_lower(inode);
+	lower_file = ecryptfs_file_to_lower(file);
+	lower_a_ops = lower_inode->i_mapping->a_ops;
+	header_pages = ((crypt_stat->header_extent_size
+			 * crypt_stat->num_header_extents_at_front)
+			/ PAGE_CACHE_SIZE);
+	BUG_ON(header_pages < 1);
+	while (current_header_page < header_pages) {
+		rc = ecryptfs_grab_and_map_lower_page(&header_page,
+						      &header_virt,
+						      lower_inode,
+						      current_header_page);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "grab_cache_page for "
+					"header page [%d] failed; rc = [%d]\n",
+					current_header_page, rc);
+			goto out;
+		}
+		rc = lower_a_ops->prepare_write(lower_file, header_page, 0,
+						PAGE_CACHE_SIZE);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error preparing to write "
+					"header page out; rc = [%d]\n", rc);
+			goto out;
+		}
+		memset(header_virt, 0, PAGE_CACHE_SIZE);
+		if (more_header_data_to_be_written) {
+			rc = ecryptfs_write_headers_virt(header_virt,
+							 crypt_stat,
+							 file->f_dentry);
+			if (rc) {
+				ecryptfs_printk(KERN_WARNING, "Error "
+						"generating header; rc = "
+						"[%d]\n", rc);
+				rc = -EIO;
+				memset(header_virt, 0, PAGE_CACHE_SIZE);
+				ecryptfs_unmap_and_release_lower_page(
+					header_page);
+				goto out;
+			}
+			if (current_header_page == 0)
+				memset(header_virt, 0, 8);
+			more_header_data_to_be_written = 0;
+		}
+		rc = lower_a_ops->commit_write(lower_file, header_page, 0,
+					       PAGE_CACHE_SIZE);
+		ecryptfs_unmap_and_release_lower_page(header_page);
+		if (rc < 0) {
+			ecryptfs_printk(KERN_ERR,
+					"Error commiting header page write; "
+					"rc = [%d]\n", rc);
+			break;
+		}
+		current_header_page++;
+	}
+	if (rc >= 0) {
+		rc = 0;
+		ecryptfs_printk(KERN_DEBUG, "lower_inode->i_blocks = "
+				"[0x%.16x]\n", lower_inode->i_blocks);
+		i_size_write(inode, 0);
+		lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty_sync(inode);
+	}
+	ecryptfs_printk(KERN_DEBUG, "Clearing ECRYPTFS_NEW_FILE flag in "
+			"crypt_stat at memory location [%p]\n", crypt_stat);
+	ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE);
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_commit_write
+ * @file: The eCryptfs file object
+ * @page: The eCryptfs page
+ * @from: Ignored (we rotate the page IV on each write)
+ * @to: Ignored
+ *
+ * This is where we encrypt the data and pass the encrypted data to
+ * the lower filesystem.  In OpenPGP-compatible mode, we operate on
+ * entire underlying packets.
+ */
+static int ecryptfs_commit_write(struct file *file, struct page *page,
+				 unsigned from, unsigned to)
+{
+	struct ecryptfs_page_crypt_context ctx;
+	loff_t pos;
+	struct inode *inode;
+	struct inode *lower_inode;
+	struct file *lower_file;
+	struct ecryptfs_crypt_stat *crypt_stat;
+	int rc;
+
+	inode = page->mapping->host;
+	lower_inode = ecryptfs_inode_to_lower(inode);
+	lower_file = ecryptfs_file_to_lower(file);
+	mutex_lock(&lower_inode->i_mutex);
+	crypt_stat =
+		&ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat;
+	if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) {
+		ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in "
+			"crypt_stat at memory location [%p]\n", crypt_stat);
+		rc = process_new_file(crypt_stat, file, inode);
+		if (rc) {
+			ecryptfs_printk(KERN_ERR, "Error processing new "
+					"file; rc = [%d]\n", rc);
+			goto out;
+		}
+	} else
+		ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
+	ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
+			"(page w/ index = [0x%.16x], to = [%d])\n", page->index,
+			to);
+	rc = fill_zeros_to_end_of_page(page, to);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error attempting to fill "
+				"zeros in page with index = [0x%.16x]\n",
+				page->index);
+		goto out;
+	}
+	ctx.page = page;
+	ctx.mode = ECRYPTFS_PREPARE_COMMIT_MODE;
+	ctx.param.lower_file = lower_file;
+	rc = ecryptfs_encrypt_page(&ctx);
+	if (rc) {
+		ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
+				"index [0x%.16x])\n", page->index);
+		goto out;
+	}
+	rc = 0;
+	inode->i_blocks = lower_inode->i_blocks;
+	pos = (page->index << PAGE_CACHE_SHIFT) + to;
+	if (pos > i_size_read(inode)) {
+		i_size_write(inode, pos);
+		ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
+				"[0x%.16x]\n", i_size_read(inode));
+	}
+	ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode);
+	lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty_sync(inode);
+out:
+	kunmap(page); /* mapped in prior call (prepare_write) */
+	if (rc < 0)
+		ClearPageUptodate(page);
+	else
+		SetPageUptodate(page);
+	mutex_unlock(&lower_inode->i_mutex);
+	return rc;
+}
+
+/**
+ * write_zeros
+ * @file: The ecryptfs file
+ * @index: The index in which we are writing
+ * @start: The position after the last block of data
+ * @num_zeros: The number of zeros to write
+ *
+ * Write a specified number of zero's to a page.
+ *
+ * (start + num_zeros) must be less than or equal to PAGE_CACHE_SIZE
+ */
+static
+int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros)
+{
+	int rc = 0;
+	struct page *tmp_page;
+
+	tmp_page = ecryptfs_get1page(file, index);
+	if (IS_ERR(tmp_page)) {
+		ecryptfs_printk(KERN_ERR, "Error getting page at index "
+				"[0x%.16x]\n", index);
+		rc = PTR_ERR(tmp_page);
+		goto out;
+	}
+	kmap(tmp_page);
+	rc = ecryptfs_prepare_write(file, tmp_page, start, start + num_zeros);
+	if (rc) {
+		ecryptfs_printk(KERN_ERR, "Error preparing to write zero's "
+				"to remainder of page at index [0x%.16x]\n",
+				index);
+		kunmap(tmp_page);
+		page_cache_release(tmp_page);
+		goto out;
+	}
+	memset(((char *)page_address(tmp_page) + start), 0, num_zeros);
+	rc = ecryptfs_commit_write(file, tmp_page, start, start + num_zeros);
+	if (rc < 0) {
+		ecryptfs_printk(KERN_ERR, "Error attempting to write zero's "
+				"to remainder of page at index [0x%.16x]\n",
+				index);
+		kunmap(tmp_page);
+		page_cache_release(tmp_page);
+		goto out;
+	}
+	rc = 0;
+	kunmap(tmp_page);
+	page_cache_release(tmp_page);
+out:
+	return rc;
+}
+
+static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
+{
+	int rc = 0;
+	struct inode *inode;
+	struct inode *lower_inode;
+
+	inode = (struct inode *)mapping->host;
+	lower_inode = ecryptfs_inode_to_lower(inode);
+	if (lower_inode->i_mapping->a_ops->bmap)
+		rc = lower_inode->i_mapping->a_ops->bmap(lower_inode->i_mapping,
+							 block);
+	return rc;
+}
+
+static void ecryptfs_sync_page(struct page *page)
+{
+	struct inode *inode;
+	struct inode *lower_inode;
+	struct page *lower_page;
+
+	inode = page->mapping->host;
+	lower_inode = ecryptfs_inode_to_lower(inode);
+	/* NOTE: Recently swapped with grab_cache_page(), since
+	 * sync_page() just makes sure that pending I/O gets done. */
+	lower_page = find_lock_page(lower_inode->i_mapping, page->index);
+	if (!lower_page) {
+		ecryptfs_printk(KERN_DEBUG, "find_lock_page failed\n");
+		return;
+	}
+	lower_page->mapping->a_ops->sync_page(lower_page);
+	ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n",
+			lower_page->index);
+	unlock_page(lower_page);
+	page_cache_release(lower_page);
+}
+
+struct address_space_operations ecryptfs_aops = {
+	.writepage = ecryptfs_writepage,
+	.readpage = ecryptfs_readpage,
+	.prepare_write = ecryptfs_prepare_write,
+	.commit_write = ecryptfs_commit_write,
+	.bmap = ecryptfs_bmap,
+	.sync_page = ecryptfs_sync_page,
+};
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
new file mode 100644
index 000000000000..c337c0410fb1
--- /dev/null
+++ b/fs/ecryptfs/super.c
@@ -0,0 +1,198 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 1997-2003 Erez Zadok
+ * Copyright (C) 2001-2003 Stony Brook University
+ * Copyright (C) 2004-2006 International Business Machines Corp.
+ *   Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *              Michael C. Thompson <mcthomps@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/key.h>
+#include <linux/seq_file.h>
+#include <linux/crypto.h>
+#include "ecryptfs_kernel.h"
+
+struct kmem_cache *ecryptfs_inode_info_cache;
+
+/**
+ * ecryptfs_alloc_inode - allocate an ecryptfs inode
+ * @sb: Pointer to the ecryptfs super block
+ *
+ * Called to bring an inode into existence.
+ *
+ * Only handle allocation, setting up structures should be done in
+ * ecryptfs_read_inode. This is because the kernel, between now and
+ * then, will 0 out the private data pointer.
+ *
+ * Returns a pointer to a newly allocated inode, NULL otherwise
+ */
+static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
+{
+	struct ecryptfs_inode_info *ecryptfs_inode;
+	struct inode *inode = NULL;
+
+	ecryptfs_inode = kmem_cache_alloc(ecryptfs_inode_info_cache,
+					  SLAB_KERNEL);
+	if (unlikely(!ecryptfs_inode))
+		goto out;
+	ecryptfs_init_crypt_stat(&ecryptfs_inode->crypt_stat);
+	inode = &ecryptfs_inode->vfs_inode;
+out:
+	return inode;
+}
+
+/**
+ * ecryptfs_destroy_inode
+ * @inode: The ecryptfs inode
+ *
+ * This is used during the final destruction of the inode.
+ * All allocation of memory related to the inode, including allocated
+ * memory in the crypt_stat struct, will be released here.
+ * There should be no chance that this deallocation will be missed.
+ */
+static void ecryptfs_destroy_inode(struct inode *inode)
+{
+	struct ecryptfs_inode_info *inode_info;
+
+	inode_info = ecryptfs_inode_to_private(inode);
+	ecryptfs_destruct_crypt_stat(&inode_info->crypt_stat);
+	kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
+}
+
+/**
+ * ecryptfs_init_inode
+ * @inode: The ecryptfs inode
+ *
+ * Set up the ecryptfs inode.
+ */
+void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
+{
+	ecryptfs_set_inode_lower(inode, lower_inode);
+	inode->i_ino = lower_inode->i_ino;
+	inode->i_version++;
+	inode->i_op = &ecryptfs_main_iops;
+	inode->i_fop = &ecryptfs_main_fops;
+	inode->i_mapping->a_ops = &ecryptfs_aops;
+}
+
+/**
+ * ecryptfs_put_super
+ * @sb: Pointer to the ecryptfs super block
+ *
+ * Final actions when unmounting a file system.
+ * This will handle deallocation and release of our private data.
+ */
+static void ecryptfs_put_super(struct super_block *sb)
+{
+	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
+
+	ecryptfs_destruct_mount_crypt_stat(&sb_info->mount_crypt_stat);
+	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
+	ecryptfs_set_superblock_private(sb, NULL);
+}
+
+/**
+ * ecryptfs_statfs
+ * @sb: The ecryptfs super block
+ * @buf: The struct kstatfs to fill in with stats
+ *
+ * Get the filesystem statistics. Currently, we let this pass right through
+ * to the lower filesystem and take no action ourselves.
+ */
+static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf);
+}
+
+/**
+ * ecryptfs_clear_inode
+ * @inode - The ecryptfs inode
+ *
+ * Called by iput() when the inode reference count reached zero
+ * and the inode is not hashed anywhere.  Used to clear anything
+ * that needs to be, before the inode is completely destroyed and put
+ * on the inode free list. We use this to drop out reference to the
+ * lower inode.
+ */
+static void ecryptfs_clear_inode(struct inode *inode)
+{
+	iput(ecryptfs_inode_to_lower(inode));
+}
+
+/**
+ * ecryptfs_umount_begin
+ *
+ * Called in do_umount().
+ */
+static void ecryptfs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+	struct vfsmount *lower_mnt =
+		ecryptfs_dentry_to_lower_mnt(vfsmnt->mnt_sb->s_root);
+	struct super_block *lower_sb;
+
+	mntput(lower_mnt);
+	lower_sb = lower_mnt->mnt_sb;
+	if (lower_sb->s_op->umount_begin)
+		lower_sb->s_op->umount_begin(lower_mnt, flags);
+}
+
+/**
+ * ecryptfs_show_options
+ *
+ * Prints the directory we are currently mounted over.
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct super_block *sb = mnt->mnt_sb;
+	struct dentry *lower_root_dentry = ecryptfs_dentry_to_lower(sb->s_root);
+	struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(sb->s_root);
+	char *tmp_page;
+	char *path;
+	int rc = 0;
+
+	tmp_page = (char *)__get_free_page(GFP_KERNEL);
+	if (!tmp_page) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	path = d_path(lower_root_dentry, lower_mnt, tmp_page, PAGE_SIZE);
+	if (IS_ERR(path)) {
+		rc = PTR_ERR(path);
+		goto out;
+	}
+	seq_printf(m, ",dir=%s", path);
+	free_page((unsigned long)tmp_page);
+out:
+	return rc;
+}
+
+struct super_operations ecryptfs_sops = {
+	.alloc_inode = ecryptfs_alloc_inode,
+	.destroy_inode = ecryptfs_destroy_inode,
+	.drop_inode = generic_delete_inode,
+	.put_super = ecryptfs_put_super,
+	.statfs = ecryptfs_statfs,
+	.remount_fs = NULL,
+	.clear_inode = ecryptfs_clear_inode,
+	.umount_begin = ecryptfs_umount_begin,
+	.show_options = ecryptfs_show_options
+};
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 8ac2462ae5dd..b3f50651eb6b 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -90,8 +90,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(efs_inode_cachep))
-		printk(KERN_INFO "efs_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(efs_inode_cachep);
 }
 
 static void efs_put_super(struct super_block *s)
@@ -248,11 +247,10 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
 	struct buffer_head *bh;
 	struct inode *root;
 
- 	sb = kmalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
+ 	sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
 	if (!sb)
 		return -ENOMEM;
 	s->s_fs_info = sb;
-	memset(sb, 0, sizeof(struct efs_sb_info));
  
 	s->s_magic		= EFS_SUPER_MAGIC;
 	if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3a3567433b92..ae228ec54e94 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -105,6 +105,8 @@
 /* Maximum msec timeout value storeable in a long int */
 #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
 
+#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
+
 
 struct epoll_filefd {
 	struct file *file;
@@ -497,7 +499,7 @@ void eventpoll_release_file(struct file *file)
  */
 asmlinkage long sys_epoll_create(int size)
 {
-	int error, fd;
+	int error, fd = -1;
 	struct eventpoll *ep;
 	struct inode *inode;
 	struct file *file;
@@ -640,7 +642,6 @@ eexit_1:
 	return error;
 }
 
-#define MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
@@ -657,7 +658,7 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
 		     current, epfd, events, maxevents, timeout));
 
 	/* The maximum number of event must be greater than zero */
-	if (maxevents <= 0 || maxevents > MAX_EVENTS)
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
 		return -EINVAL;
 
 	/* Verify that the area passed by the user is writeable */
@@ -699,6 +700,55 @@ eexit_1:
 }
 
 
+#ifdef TIF_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+		int maxevents, int timeout, const sigset_t __user *sigmask,
+		size_t sigsetsize)
+{
+	int error;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (error == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+				sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		} else
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	}
+
+	return error;
+}
+
+#endif /* #ifdef TIF_RESTORE_SIGMASK */
+
+
 /*
  * Creates the file descriptor to be used by the epoll interface.
  */
@@ -720,9 +770,10 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile,
 
 	/* Allocates an inode from the eventpoll file system */
 	inode = ep_eventpoll_inode();
-	error = PTR_ERR(inode);
-	if (IS_ERR(inode))
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
 		goto eexit_2;
+	}
 
 	/* Allocates a free descriptor to plug the file onto */
 	error = get_unused_fd();
@@ -1590,7 +1641,6 @@ static struct inode *ep_eventpoll_inode(void)
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blksize = PAGE_SIZE;
 	return inode;
 
 eexit_1:
diff --git a/fs/exec.c b/fs/exec.c
index 54135df2a966..d993ea1a81ae 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -46,7 +46,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/rmap.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/audit.h>
 
@@ -58,7 +58,7 @@
 #endif
 
 int core_uses_pid;
-char core_pattern[65] = "core";
+char core_pattern[128] = "core";
 int suid_dumpable = 0;
 
 EXPORT_SYMBOL(suid_dumpable);
@@ -595,7 +595,7 @@ static int de_thread(struct task_struct *tsk)
 	if (!newsighand)
 		return -ENOMEM;
 
-	if (thread_group_empty(current))
+	if (thread_group_empty(tsk))
 		goto no_thread_group;
 
 	/*
@@ -620,17 +620,17 @@ static int de_thread(struct task_struct *tsk)
 	 * Reparenting needs write_lock on tasklist_lock,
 	 * so it is safe to do it under read_lock.
 	 */
-	if (unlikely(current->group_leader == child_reaper))
-		child_reaper = current;
+	if (unlikely(tsk->group_leader == child_reaper))
+		child_reaper = tsk;
 
-	zap_other_threads(current);
+	zap_other_threads(tsk);
 	read_unlock(&tasklist_lock);
 
 	/*
 	 * Account for the thread group leader hanging around:
 	 */
 	count = 1;
-	if (!thread_group_leader(current)) {
+	if (!thread_group_leader(tsk)) {
 		count = 2;
 		/*
 		 * The SIGALRM timer survives the exec, but needs to point
@@ -639,14 +639,14 @@ static int de_thread(struct task_struct *tsk)
 		 * synchronize with any firing (by calling del_timer_sync)
 		 * before we can safely let the old group leader die.
 		 */
-		sig->tsk = current;
+		sig->tsk = tsk;
 		spin_unlock_irq(lock);
 		if (hrtimer_cancel(&sig->real_timer))
 			hrtimer_restart(&sig->real_timer);
 		spin_lock_irq(lock);
 	}
 	while (atomic_read(&sig->count) > count) {
-		sig->group_exit_task = current;
+		sig->group_exit_task = tsk;
 		sig->notify_count = count;
 		__set_current_state(TASK_UNINTERRUPTIBLE);
 		spin_unlock_irq(lock);
@@ -662,13 +662,13 @@ static int de_thread(struct task_struct *tsk)
 	 * do is to wait for the thread group leader to become inactive,
 	 * and to assume its PID:
 	 */
-	if (!thread_group_leader(current)) {
+	if (!thread_group_leader(tsk)) {
 		/*
 		 * Wait for the thread group leader to be a zombie.
 		 * It should already be zombie at this point, most
 		 * of the time.
 		 */
-		leader = current->group_leader;
+		leader = tsk->group_leader;
 		while (leader->exit_state != EXIT_ZOMBIE)
 			yield();
 
@@ -682,12 +682,12 @@ static int de_thread(struct task_struct *tsk)
 		 * When we take on its identity by switching to its PID, we
 		 * also take its birthdate (always earlier than our own).
 		 */
-		current->start_time = leader->start_time;
+		tsk->start_time = leader->start_time;
 
 		write_lock_irq(&tasklist_lock);
 
-		BUG_ON(leader->tgid != current->tgid);
-		BUG_ON(current->pid == current->tgid);
+		BUG_ON(leader->tgid != tsk->tgid);
+		BUG_ON(tsk->pid == tsk->tgid);
 		/*
 		 * An exec() starts a new thread group with the
 		 * TGID of the previous thread group. Rehash the
@@ -696,24 +696,21 @@ static int de_thread(struct task_struct *tsk)
 		 */
 
 		/* Become a process group leader with the old leader's pid.
-		 * Note: The old leader also uses thispid until release_task
+		 * The old leader becomes a thread of the this thread group.
+		 * Note: The old leader also uses this pid until release_task
 		 *       is called.  Odd but simple and correct.
 		 */
-		detach_pid(current, PIDTYPE_PID);
-		current->pid = leader->pid;
-		attach_pid(current, PIDTYPE_PID,  current->pid);
-		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
-		attach_pid(current, PIDTYPE_SID,  current->signal->session);
-		list_replace_rcu(&leader->tasks, &current->tasks);
+		detach_pid(tsk, PIDTYPE_PID);
+		tsk->pid = leader->pid;
+		attach_pid(tsk, PIDTYPE_PID,  tsk->pid);
+		transfer_pid(leader, tsk, PIDTYPE_PGID);
+		transfer_pid(leader, tsk, PIDTYPE_SID);
+		list_replace_rcu(&leader->tasks, &tsk->tasks);
 
-		current->group_leader = current;
-		leader->group_leader = current;
+		tsk->group_leader = tsk;
+		leader->group_leader = tsk;
 
-		/* Reduce leader to a thread */
-		detach_pid(leader, PIDTYPE_PGID);
-		detach_pid(leader, PIDTYPE_SID);
-
-		current->exit_signal = SIGCHLD;
+		tsk->exit_signal = SIGCHLD;
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 		leader->exit_state = EXIT_DEAD;
@@ -753,7 +750,7 @@ no_thread_group:
 		spin_lock(&oldsighand->siglock);
 		spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING);
 
-		rcu_assign_pointer(current->sighand, newsighand);
+		rcu_assign_pointer(tsk->sighand, newsighand);
 		recalc_sigpending();
 
 		spin_unlock(&newsighand->siglock);
@@ -764,7 +761,7 @@ no_thread_group:
 			kmem_cache_free(sighand_cachep, oldsighand);
 	}
 
-	BUG_ON(!thread_group_leader(current));
+	BUG_ON(!thread_group_leader(tsk));
 	return 0;
 }
 	
@@ -901,8 +898,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	return 0;
 
 mmap_failed:
-	put_files_struct(current->files);
-	current->files = files;
+	reset_files_struct(current, files);
 out:
 	return retval;
 }
@@ -1322,7 +1318,7 @@ static void format_corename(char *corename, const char *pattern, long signr)
 			case 'h':
 				down_read(&uts_sem);
 				rc = snprintf(out_ptr, out_end - out_ptr,
-					      "%s", system_utsname.nodename);
+					      "%s", utsname()->nodename);
 				up_read(&uts_sem);
 				if (rc > out_end - out_ptr)
 					goto out;
@@ -1467,6 +1463,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	int retval = 0;
 	int fsuid = current->fsuid;
 	int flag = 0;
+	int ispipe = 0;
 
 	binfmt = current->binfmt;
 	if (!binfmt || !binfmt->core_dump)
@@ -1508,22 +1505,34 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
  	lock_kernel();
 	format_corename(corename, core_pattern, signr);
 	unlock_kernel();
-	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600);
+ 	if (corename[0] == '|') {
+		/* SIGPIPE can happen, but it's just never processed */
+ 		if(call_usermodehelper_pipe(corename+1, NULL, NULL, &file)) {
+ 			printk(KERN_INFO "Core dump to %s pipe failed\n",
+			       corename);
+ 			goto fail_unlock;
+ 		}
+		ispipe = 1;
+ 	} else
+ 		file = filp_open(corename,
+				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600);
 	if (IS_ERR(file))
 		goto fail_unlock;
 	inode = file->f_dentry->d_inode;
 	if (inode->i_nlink > 1)
 		goto close_fail;	/* multiple links - don't dump */
-	if (d_unhashed(file->f_dentry))
+	if (!ispipe && d_unhashed(file->f_dentry))
 		goto close_fail;
 
-	if (!S_ISREG(inode->i_mode))
+	/* AK: actually i see no reason to not allow this for named pipes etc.,
+	   but keep the previous behaviour for now. */
+	if (!ispipe && !S_ISREG(inode->i_mode))
 		goto close_fail;
 	if (!file->f_op)
 		goto close_fail;
 	if (!file->f_op->write)
 		goto close_fail;
-	if (do_truncate(file->f_dentry, 0, 0, file) != 0)
+	if (!ispipe && do_truncate(file->f_dentry, 0, 0, file) != 0)
 		goto close_fail;
 
 	retval = binfmt->core_dump(signr, regs, file);
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4c39009350f3..93e77c3d2490 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -315,7 +315,7 @@ struct getdents_callback {
  * the name matching the specified inode number.
  */
 static int filldir_one(void * __buf, const char * name, int len,
-			loff_t pos, ino_t ino, unsigned int d_type)
+			loff_t pos, u64 ino, unsigned int d_type)
 {
 	struct getdents_callback *buf = __buf;
 	int result = 0;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index da52b4a5db64..7c420b800c34 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -89,8 +89,8 @@ ext2_acl_to_disk(const struct posix_acl *acl, size_t *size)
 	size_t n;
 
 	*size = ext2_acl_size(acl->a_count);
-	ext_acl = (ext2_acl_header *)kmalloc(sizeof(ext2_acl_header) +
-		acl->a_count * sizeof(ext2_acl_entry), GFP_KERNEL);
+	ext_acl = kmalloc(sizeof(ext2_acl_header) + acl->a_count *
+			sizeof(ext2_acl_entry), GFP_KERNEL);
 	if (!ext_acl)
 		return ERR_PTR(-ENOMEM);
 	ext_acl->a_version = cpu_to_le32(EXT2_ACL_VERSION);
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 92ea8265d7d5..3e7a84a1e509 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -661,5 +661,8 @@ const struct file_operations ext2_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ext2_readdir,
 	.ioctl		= ext2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext2_compat_ioctl,
+#endif
 	.fsync		= ext2_sync_file,
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e65a019fc7a5..c19ac153f56b 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -137,6 +137,7 @@ extern void ext2_set_inode_flags(struct inode *inode);
 /* ioctl.c */
 extern int ext2_ioctl (struct inode *, struct file *, unsigned int,
 		       unsigned long);
+extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
 
 /* namei.c */
 struct dentry *ext2_get_parent(struct dentry *child);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 23e2c7ccec1d..2dba473c524a 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -41,17 +41,18 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
  */
 const struct file_operations ext2_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.ioctl		= ext2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext2_compat_ioctl,
+#endif
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_sync_file,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
 	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
@@ -63,6 +64,9 @@ const struct file_operations ext2_xip_file_operations = {
 	.read		= xip_file_read,
 	.write		= xip_file_write,
 	.ioctl		= ext2_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext2_compat_ioctl,
+#endif
 	.mmap		= xip_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 695f69ccf908..2cb545bf0f3c 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -574,7 +574,6 @@ got:
 	inode->i_mode = mode;
 
 	inode->i_ino = ino;
-	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	memset(ei->i_data, 0, sizeof(ei->i_data));
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fb4d3220eb8d..dd4e14c221e0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1094,7 +1094,6 @@ void ext2_read_inode (struct inode * inode)
 		brelse (bh);
 		goto bad_inode;
 	}
-	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 3ca9afdf713d..1dfba77eab10 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -11,6 +11,8 @@
 #include <linux/capability.h>
 #include <linux/time.h>
 #include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/smp_lock.h>
 #include <asm/current.h>
 #include <asm/uaccess.h>
 
@@ -80,3 +82,33 @@ int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		return -ENOTTY;
 	}
 }
+
+#ifdef CONFIG_COMPAT
+long ext2_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int ret;
+
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case EXT2_IOC32_GETFLAGS:
+		cmd = EXT2_IOC_GETFLAGS;
+		break;
+	case EXT2_IOC32_SETFLAGS:
+		cmd = EXT2_IOC_SETFLAGS;
+		break;
+	case EXT2_IOC32_GETVERSION:
+		cmd = EXT2_IOC_GETVERSION;
+		break;
+	case EXT2_IOC32_SETVERSION:
+		cmd = EXT2_IOC_SETVERSION;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	lock_kernel();
+	ret = ext2_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+	unlock_kernel();
+	return ret;
+}
+#endif
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 4ca824985321..e1af5b4cf80c 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -326,7 +326,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		ext2_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		inode_dec_link_count(new_inode);
 	} else {
 		if (dir_de) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 4286ff6330b6..d8b9abd95d07 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -184,8 +184,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(ext2_inode_cachep))
-		printk(KERN_INFO "ext2_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(ext2_inode_cachep);
 }
 
 static void ext2_clear_inode(struct inode *inode)
@@ -365,7 +364,6 @@ static int parse_options (char * options,
 {
 	char * p;
 	substring_t args[MAX_OPT_ARGS];
-	unsigned long kind = EXT2_MOUNT_ERRORS_CONT;
 	int option;
 
 	if (!options)
@@ -405,13 +403,19 @@ static int parse_options (char * options,
 			/* *sb_block = match_int(&args[0]); */
 			break;
 		case Opt_err_panic:
-			kind = EXT2_MOUNT_ERRORS_PANIC;
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			set_opt (sbi->s_mount_opt, ERRORS_PANIC);
 			break;
 		case Opt_err_ro:
-			kind = EXT2_MOUNT_ERRORS_RO;
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_RO);
 			break;
 		case Opt_err_cont:
-			kind = EXT2_MOUNT_ERRORS_CONT;
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_CONT);
 			break;
 		case Opt_nouid32:
 			set_opt (sbi->s_mount_opt, NO_UID32);
@@ -490,7 +494,6 @@ static int parse_options (char * options,
 			return 0;
 		}
 	}
-	sbi->s_mount_opt |= kind;
 	return 1;
 }
 
@@ -544,17 +547,24 @@ static int ext2_check_descriptors (struct super_block * sb)
 	int i;
 	int desc_block = 0;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
-	unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	unsigned long first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	unsigned long last_block;
 	struct ext2_group_desc * gdp = NULL;
 
 	ext2_debug ("Checking group descriptors");
 
 	for (i = 0; i < sbi->s_groups_count; i++)
 	{
+		if (i == sbi->s_groups_count - 1)
+			last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
+		else
+			last_block = first_block +
+				(EXT2_BLOCKS_PER_GROUP(sb) - 1);
+
 		if ((i % EXT2_DESC_PER_BLOCK(sb)) == 0)
 			gdp = (struct ext2_group_desc *) sbi->s_group_desc[desc_block++]->b_data;
-		if (le32_to_cpu(gdp->bg_block_bitmap) < block ||
-		    le32_to_cpu(gdp->bg_block_bitmap) >= block + EXT2_BLOCKS_PER_GROUP(sb))
+		if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
+		    le32_to_cpu(gdp->bg_block_bitmap) > last_block)
 		{
 			ext2_error (sb, "ext2_check_descriptors",
 				    "Block bitmap for group %d"
@@ -562,8 +572,8 @@ static int ext2_check_descriptors (struct super_block * sb)
 				    i, (unsigned long) le32_to_cpu(gdp->bg_block_bitmap));
 			return 0;
 		}
-		if (le32_to_cpu(gdp->bg_inode_bitmap) < block ||
-		    le32_to_cpu(gdp->bg_inode_bitmap) >= block + EXT2_BLOCKS_PER_GROUP(sb))
+		if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
+		    le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
 		{
 			ext2_error (sb, "ext2_check_descriptors",
 				    "Inode bitmap for group %d"
@@ -571,9 +581,9 @@ static int ext2_check_descriptors (struct super_block * sb)
 				    i, (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap));
 			return 0;
 		}
-		if (le32_to_cpu(gdp->bg_inode_table) < block ||
-		    le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >=
-		    block + EXT2_BLOCKS_PER_GROUP(sb))
+		if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
+		    le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >
+		    last_block)
 		{
 			ext2_error (sb, "ext2_check_descriptors",
 				    "Inode table for group %d"
@@ -581,7 +591,7 @@ static int ext2_check_descriptors (struct super_block * sb)
 				    i, (unsigned long) le32_to_cpu(gdp->bg_inode_table));
 			return 0;
 		}
-		block += EXT2_BLOCKS_PER_GROUP(sb);
+		first_block += EXT2_BLOCKS_PER_GROUP(sb);
 		gdp++;
 	}
 	return 1;
@@ -648,11 +658,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	int i, j;
 	__le32 features;
 
-	sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 	sb->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(*sbi));
 
 	/*
 	 * See what the current blocksize for the device is, and
@@ -710,6 +719,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_RO)
 		set_opt(sbi->s_mount_opt, ERRORS_RO);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_CONT);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -861,10 +872,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext2;
-	sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
-				        le32_to_cpu(es->s_first_data_block) +
-				       EXT2_BLOCKS_PER_GROUP(sb) - 1) /
-				       EXT2_BLOCKS_PER_GROUP(sb);
+ 	sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
+ 				le32_to_cpu(es->s_first_data_block) - 1)
+ 					/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
 	db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
 		   EXT2_DESC_PER_BLOCK(sb);
 	sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 86ae8e93adb9..af52a7f8b291 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -521,11 +521,10 @@ bad_block:		ext2_error(sb, "ext2_xattr_set",
 		}
 	} else {
 		/* Allocate a buffer where we construct the new block. */
-		header = kmalloc(sb->s_blocksize, GFP_KERNEL);
+		header = kzalloc(sb->s_blocksize, GFP_KERNEL);
 		error = -ENOMEM;
 		if (header == NULL)
 			goto cleanup;
-		memset(header, 0, sb->s_blocksize);
 		end = (char *)header + sb->s_blocksize;
 		header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC);
 		header->h_blocks = header->h_refcount = cpu_to_le32(1);
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 0d21d558b87a..1e5038d9a01b 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -90,8 +90,8 @@ ext3_acl_to_disk(const struct posix_acl *acl, size_t *size)
 	size_t n;
 
 	*size = ext3_acl_size(acl->a_count);
-	ext_acl = (ext3_acl_header *)kmalloc(sizeof(ext3_acl_header) +
-		acl->a_count * sizeof(ext3_acl_entry), GFP_KERNEL);
+	ext_acl = kmalloc(sizeof(ext3_acl_header) + acl->a_count *
+			sizeof(ext3_acl_entry), GFP_KERNEL);
 	if (!ext_acl)
 		return ERR_PTR(-ENOMEM);
 	ext_acl->a_version = cpu_to_le32(EXT3_ACL_VERSION);
@@ -258,7 +258,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
 		default:
 			return -EINVAL;
 	}
- 	if (acl) {
+	if (acl) {
 		value = ext3_acl_to_disk(acl, &size);
 		if (IS_ERR(value))
 			return (int)PTR_ERR(value);
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 063d994bda0b..b41a7d7e20f0 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -38,6 +38,13 @@
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
+/**
+ * ext3_get_group_desc() -- load group descriptor from disk
+ * @sb:			super block
+ * @block_group:	given block group
+ * @bh:			pointer to the buffer head to store the block
+ *			group descriptor
+ */
 struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
 					     unsigned int block_group,
 					     struct buffer_head ** bh)
@@ -73,8 +80,12 @@ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
 	return desc + offset;
 }
 
-/*
- * Read the bitmap for a given block_group, reading into the specified 
+/**
+ * read_block_bitmap()
+ * @sb:			super block
+ * @block_group:	given block group
+ *
+ * Read the bitmap for a given block_group, reading into the specified
  * slot in the superblock's bitmap cache.
  *
  * Return buffer_head on success or NULL in case of failure.
@@ -103,15 +114,22 @@ error_out:
  * Operations include:
  * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
  *
- * We use sorted double linked list for the per-filesystem reservation
- * window list. (like in vm_region).
+ * We use a red-black tree to represent per-filesystem reservation
+ * windows.
+ *
+ */
+
+/**
+ * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
+ * @rb_root:		root of per-filesystem reservation rb tree
+ * @verbose:		verbose mode
+ * @fn:			function which wishes to dump the reservation map
  *
- * Initially, we keep those small operations in the abstract functions,
- * so later if we need a better searching tree than double linked-list,
- * we could easily switch to that without changing too much
- * code.
+ * If verbose is turned on, it will print the whole block reservation
+ * windows(start, end).	Otherwise, it will only print out the "bad" windows,
+ * those windows that overlap with their immediate neighbors.
  */
-#if 0
+#if 1
 static void __rsv_window_dump(struct rb_root *root, int verbose,
 			      const char *fn)
 {
@@ -129,7 +147,7 @@ restart:
 		rsv = list_entry(n, struct ext3_reserve_window_node, rsv_node);
 		if (verbose)
 			printk("reservation window 0x%p "
-			       "start:  %d, end:  %d\n",
+			       "start:  %lu, end:  %lu\n",
 			       rsv, rsv->rsv_start, rsv->rsv_end);
 		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
 			printk("Bad reservation %p (start >= end)\n",
@@ -161,6 +179,22 @@ restart:
 #define rsv_window_dump(root, verbose) do {} while (0)
 #endif
 
+/**
+ * goal_in_my_reservation()
+ * @rsv:		inode's reservation window
+ * @grp_goal:		given goal block relative to the allocation block group
+ * @group:		the current allocation block group
+ * @sb:			filesystem super block
+ *
+ * Test if the given goal block (group relative) is within the file's
+ * own block reservation window range.
+ *
+ * If the reservation window is outside the goal allocation group, return 0;
+ * grp_goal (given goal block) could be -1, which means no specific
+ * goal block. In this case, always return 1.
+ * If the goal block is within the reservation window, return 1;
+ * otherwise, return 0;
+ */
 static int
 goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
 			unsigned int group, struct super_block * sb)
@@ -168,7 +202,7 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
 	ext3_fsblk_t group_first_block, group_last_block;
 
 	group_first_block = ext3_group_first_block_no(sb, group);
-	group_last_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
+	group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
 
 	if ((rsv->_rsv_start > group_last_block) ||
 	    (rsv->_rsv_end < group_first_block))
@@ -179,7 +213,11 @@ goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
 	return 1;
 }
 
-/*
+/**
+ * search_reserve_window()
+ * @rb_root:		root of reservation tree
+ * @goal:		target allocation block
+ *
  * Find the reserved window which includes the goal, or the previous one
  * if the goal is not in any window.
  * Returns NULL if there are no windows or if all windows start after the goal.
@@ -216,6 +254,13 @@ search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
 	return rsv;
 }
 
+/**
+ * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
+ * @sb:			super block
+ * @rsv:		reservation window to add
+ *
+ * Must be called with rsv_lock hold.
+ */
 void ext3_rsv_window_add(struct super_block *sb,
 		    struct ext3_reserve_window_node *rsv)
 {
@@ -236,14 +281,25 @@ void ext3_rsv_window_add(struct super_block *sb,
 			p = &(*p)->rb_left;
 		else if (start > this->rsv_end)
 			p = &(*p)->rb_right;
-		else
+		else {
+			rsv_window_dump(root, 1);
 			BUG();
+		}
 	}
 
 	rb_link_node(node, parent, p);
 	rb_insert_color(node, root);
 }
 
+/**
+ * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
+ * @sb:			super block
+ * @rsv:		reservation window to remove
+ *
+ * Mark the block reservation window as not allocated, and unlink it
+ * from the filesystem reservation window rb tree. Must be called with
+ * rsv_lock hold.
+ */
 static void rsv_window_remove(struct super_block *sb,
 			      struct ext3_reserve_window_node *rsv)
 {
@@ -253,11 +309,39 @@ static void rsv_window_remove(struct super_block *sb,
 	rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
 }
 
+/*
+ * rsv_is_empty() -- Check if the reservation window is allocated.
+ * @rsv:		given reservation window to check
+ *
+ * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
+ */
 static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
 {
 	/* a valid reservation end block could not be 0 */
-	return (rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED);
+	return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
 }
+
+/**
+ * ext3_init_block_alloc_info()
+ * @inode:		file inode structure
+ *
+ * Allocate and initialize the	reservation window structure, and
+ * link the window to the ext3 inode structure at last
+ *
+ * The reservation window structure is only dynamically allocated
+ * and linked to ext3 inode the first time the open file
+ * needs a new block. So, before every ext3_new_block(s) call, for
+ * regular files, we should check whether the reservation window
+ * structure exists or not. In the latter case, this function is called.
+ * Fail to do so will result in block reservation being turned off for that
+ * open file.
+ *
+ * This function is called from ext3_get_blocks_handle(), also called
+ * when setting the reservation window size through ioctl before the file
+ * is open for write (needs block allocation).
+ *
+ * Needs truncate_mutex protection prior to call this function.
+ */
 void ext3_init_block_alloc_info(struct inode *inode)
 {
 	struct ext3_inode_info *ei = EXT3_I(inode);
@@ -271,7 +355,7 @@ void ext3_init_block_alloc_info(struct inode *inode)
 		rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
 		rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
 
-	 	/*
+		/*
 		 * if filesystem is mounted with NORESERVATION, the goal
 		 * reservation window size is set to zero to indicate
 		 * block reservation is off
@@ -287,6 +371,19 @@ void ext3_init_block_alloc_info(struct inode *inode)
 	ei->i_block_alloc_info = block_i;
 }
 
+/**
+ * ext3_discard_reservation()
+ * @inode:		inode
+ *
+ * Discard(free) block reservation window on last file close, or truncate
+ * or at last iput().
+ *
+ * It is being called in three cases:
+ *	ext3_release_file(): last writer close the file
+ *	ext3_clear_inode(): last iput(), when nobody link to this file.
+ *	ext3_truncate(): when the block indirect map is about to change.
+ *
+ */
 void ext3_discard_reservation(struct inode *inode)
 {
 	struct ext3_inode_info *ei = EXT3_I(inode);
@@ -306,7 +403,14 @@ void ext3_discard_reservation(struct inode *inode)
 	}
 }
 
-/* Free given blocks, update quota and i_blocks field */
+/**
+ * ext3_free_blocks_sb() -- Free given blocks and update quota
+ * @handle:			handle to this transaction
+ * @sb:				super block
+ * @block:			start physcial block to free
+ * @count:			number of blocks to free
+ * @pdquot_freed_blocks:	pointer to quota
+ */
 void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
 			 ext3_fsblk_t block, unsigned long count,
 			 unsigned long *pdquot_freed_blocks)
@@ -419,8 +523,8 @@ do_more:
 		}
 		/* @@@ This prevents newly-allocated data from being
 		 * freed and then reallocated within the same
-		 * transaction. 
-		 * 
+		 * transaction.
+		 *
 		 * Ideally we would want to allow that to happen, but to
 		 * do so requires making journal_forget() capable of
 		 * revoking the queued write of a data block, which
@@ -433,7 +537,7 @@ do_more:
 		 * safe not to set the allocation bit in the committed
 		 * bitmap, because we know that there is no outstanding
 		 * activity on the buffer any more and so it is safe to
-		 * reallocate it.  
+		 * reallocate it.
 		 */
 		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
 		J_ASSERT_BH(bitmap_bh,
@@ -490,7 +594,13 @@ error_return:
 	return;
 }
 
-/* Free given blocks, update quota and i_blocks field */
+/**
+ * ext3_free_blocks() -- Free given blocks and update quota
+ * @handle:		handle for this transaction
+ * @inode:		inode
+ * @block:		start physical block to free
+ * @count:		number of blocks to count
+ */
 void ext3_free_blocks(handle_t *handle, struct inode *inode,
 			ext3_fsblk_t block, unsigned long count)
 {
@@ -508,7 +618,11 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
 	return;
 }
 
-/*
+/**
+ * ext3_test_allocatable()
+ * @nr:			given allocation block group
+ * @bh:			bufferhead contains the bitmap of the given block group
+ *
  * For ext3 allocations, we must not reuse any blocks which are
  * allocated in the bitmap buffer's "last committed data" copy.  This
  * prevents deletes from freeing up the page for reuse until we have
@@ -518,7 +632,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
  * data would allow the old block to be overwritten before the
  * transaction committed (because we force data to disk before commit).
  * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete. 
+ * data and committing the delete.
  *
  * @@@ We may want to make this allocation behaviour conditional on
  * data-writes at some point, and disable it for metadata allocations or
@@ -541,6 +655,16 @@ static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
 	return ret;
 }
 
+/**
+ * bitmap_search_next_usable_block()
+ * @start:		the starting block (group relative) of the search
+ * @bh:			bufferhead contains the block group bitmap
+ * @maxblocks:		the ending block (group relative) of the reservation
+ *
+ * The bitmap search --- search forward alternately through the actual
+ * bitmap on disk and the last-committed copy in journal, until we find a
+ * bit free in both bitmaps.
+ */
 static ext3_grpblk_t
 bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 					ext3_grpblk_t maxblocks)
@@ -548,11 +672,6 @@ bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 	ext3_grpblk_t next;
 	struct journal_head *jh = bh2jh(bh);
 
-	/*
-	 * The bitmap search --- search forward alternately through the actual
-	 * bitmap and the last-committed copy until we find a bit free in
-	 * both
-	 */
 	while (start < maxblocks) {
 		next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
 		if (next >= maxblocks)
@@ -562,14 +681,20 @@ bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 		jbd_lock_bh_state(bh);
 		if (jh->b_committed_data)
 			start = ext3_find_next_zero_bit(jh->b_committed_data,
-						 	maxblocks, next);
+							maxblocks, next);
 		jbd_unlock_bh_state(bh);
 	}
 	return -1;
 }
 
-/*
- * Find an allocatable block in a bitmap.  We honour both the bitmap and
+/**
+ * find_next_usable_block()
+ * @start:		the starting block (group relative) to find next
+ *			allocatable block in bitmap.
+ * @bh:			bufferhead contains the block group bitmap
+ * @maxblocks:		the ending block (group relative) for the search
+ *
+ * Find an allocatable block in a bitmap.  We honor both the bitmap and
  * its last-committed copy (if that exists), and perform the "most
  * appropriate allocation" algorithm of looking for a free block near
  * the initial goal; then for a free byte somewhere in the bitmap; then
@@ -584,7 +709,7 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 
 	if (start > 0) {
 		/*
-		 * The goal was occupied; search forward for a free 
+		 * The goal was occupied; search forward for a free
 		 * block within the next XX blocks.
 		 *
 		 * end_goal is more or less random, but it has to be
@@ -620,7 +745,11 @@ find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
 	return here;
 }
 
-/*
+/**
+ * claim_block()
+ * @block:		the free block (group relative) to allocate
+ * @bh:			the bufferhead containts the block group bitmap
+ *
  * We think we can allocate this block in this bitmap.  Try to set the bit.
  * If that succeeds then check that nobody has allocated and then freed the
  * block since we saw that is was not marked in b_committed_data.  If it _was_
@@ -646,7 +775,26 @@ claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
 	return ret;
 }
 
-/*
+/**
+ * ext3_try_to_allocate()
+ * @sb:			superblock
+ * @handle:		handle to this transaction
+ * @group:		given allocation block group
+ * @bitmap_bh:		bufferhead holds the block bitmap
+ * @grp_goal:		given target block within the group
+ * @count:		target number of blocks to allocate
+ * @my_rsv:		reservation window
+ *
+ * Attempt to allocate blocks within a give range. Set the range of allocation
+ * first, then find the first free bit(s) from the bitmap (within the range),
+ * and at last, allocate the blocks by claiming the found free bit as allocated.
+ *
+ * To set the range of this allocation:
+ *	if there is a reservation window, only try to allocate block(s) from the
+ *	file's own reservation window;
+ *	Otherwise, the allocation range starts from the give goal block, ends at
+ *	the block group's last block.
+ *
  * If we failed to allocate the desired block then we may end up crossing to a
  * new bitmap.  In that case we must release write access to the old one via
  * ext3_journal_release_buffer(), else we'll run out of credits.
@@ -703,7 +851,8 @@ repeat:
 	}
 	start = grp_goal;
 
-	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
+	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
+		grp_goal, bitmap_bh)) {
 		/*
 		 * The block was allocated by another thread, or it was
 		 * allocated and then freed by another thread
@@ -718,7 +867,8 @@ repeat:
 	grp_goal++;
 	while (num < *count && grp_goal < end
 		&& ext3_test_allocatable(grp_goal, bitmap_bh)
-		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group), grp_goal, bitmap_bh)) {
+		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group),
+				grp_goal, bitmap_bh)) {
 		num++;
 		grp_goal++;
 	}
@@ -730,12 +880,12 @@ fail_access:
 }
 
 /**
- * 	find_next_reservable_window():
+ *	find_next_reservable_window():
  *		find a reservable space within the given range.
  *		It does not allocate the reservation window for now:
  *		alloc_new_reservation() will do the work later.
  *
- * 	@search_head: the head of the searching list;
+ *	@search_head: the head of the searching list;
  *		This is not necessarily the list head of the whole filesystem
  *
  *		We have both head and start_block to assist the search
@@ -743,12 +893,12 @@ fail_access:
  *		but we will shift to the place where start_block is,
  *		then start from there, when looking for a reservable space.
  *
- * 	@size: the target new reservation window size
+ *	@size: the target new reservation window size
  *
- * 	@group_first_block: the first block we consider to start
+ *	@group_first_block: the first block we consider to start
  *			the real search from
  *
- * 	@last_block:
+ *	@last_block:
  *		the maximum block number that our goal reservable space
  *		could start from. This is normally the last block in this
  *		group. The search will end when we found the start of next
@@ -756,10 +906,10 @@ fail_access:
  *		This could handle the cross boundary reservation window
  *		request.
  *
- * 	basically we search from the given range, rather than the whole
- * 	reservation double linked list, (start_block, last_block)
- * 	to find a free region that is of my size and has not
- * 	been reserved.
+ *	basically we search from the given range, rather than the whole
+ *	reservation double linked list, (start_block, last_block)
+ *	to find a free region that is of my size and has not
+ *	been reserved.
  *
  */
 static int find_next_reservable_window(
@@ -812,7 +962,7 @@ static int find_next_reservable_window(
 			/*
 			 * Found a reserveable space big enough.  We could
 			 * have a reservation across the group boundary here
-		 	 */
+			 */
 			break;
 		}
 	}
@@ -848,7 +998,7 @@ static int find_next_reservable_window(
 }
 
 /**
- * 	alloc_new_reservation()--allocate a new reservation window
+ *	alloc_new_reservation()--allocate a new reservation window
  *
  *		To make a new reservation, we search part of the filesystem
  *		reservation list (the list that inside the group). We try to
@@ -897,7 +1047,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
 	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;
 
 	group_first_block = ext3_group_first_block_no(sb, group);
-	group_end_block = group_first_block + EXT3_BLOCKS_PER_GROUP(sb) - 1;
+	group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
 
 	if (grp_goal < 0)
 		start_block = group_first_block;
@@ -929,9 +1079,10 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
 		if ((my_rsv->rsv_alloc_hit >
 		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
 			/*
-			 * if we previously allocation hit ration is greater than half
-			 * we double the size of reservation window next time
-			 * otherwise keep the same
+			 * if the previously allocation hit ratio is
+			 * greater than 1/2, then we double the size of
+			 * the reservation window the next time,
+			 * otherwise we keep the same size window
 			 */
 			size = size * 2;
 			if (size > EXT3_MAX_RESERVE_BLOCKS)
@@ -1010,6 +1161,23 @@ retry:
 	goto retry;
 }
 
+/**
+ * try_to_extend_reservation()
+ * @my_rsv:		given reservation window
+ * @sb:			super block
+ * @size:		the delta to extend
+ *
+ * Attempt to expand the reservation window large enough to have
+ * required number of free blocks
+ *
+ * Since ext3_try_to_allocate() will always allocate blocks within
+ * the reservation window range, if the window size is too small,
+ * multiple blocks allocation has to stop at the end of the reservation
+ * window. To make this more efficient, given the total number of
+ * blocks needed and the current size of the window, we try to
+ * expand the reservation window size if necessary on a best-effort
+ * basis before ext3_new_blocks() tries to allocate blocks,
+ */
 static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
 			struct super_block *sb, int size)
 {
@@ -1035,7 +1203,17 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
 	spin_unlock(rsv_lock);
 }
 
-/*
+/**
+ * ext3_try_to_allocate_with_rsv()
+ * @sb:			superblock
+ * @handle:		handle to this transaction
+ * @group:		given allocation block group
+ * @bitmap_bh:		bufferhead holds the block bitmap
+ * @grp_goal:		given target block within the group
+ * @count:		target number of blocks to allocate
+ * @my_rsv:		reservation window
+ * @errp:		pointer to store the error code
+ *
  * This is the main function used to allocate a new block and its reservation
  * window.
  *
@@ -1051,9 +1229,7 @@ static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
  * reservation), and there are lots of free blocks, but they are all
  * being reserved.
  *
- * We use a sorted double linked list for the per-filesystem reservation list.
- * The insert, remove and find a free space(non-reserved) operations for the
- * sorted double linked list should be fast.
+ * We use a red-black tree for the per-filesystem reservation list.
  *
  */
 static ext3_grpblk_t
@@ -1063,7 +1239,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			struct ext3_reserve_window_node * my_rsv,
 			unsigned long *count, int *errp)
 {
-	ext3_fsblk_t group_first_block;
+	ext3_fsblk_t group_first_block, group_last_block;
 	ext3_grpblk_t ret = 0;
 	int fatal;
 	unsigned long num = *count;
@@ -1100,6 +1276,7 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 * first block is the block number of the first block in this group
 	 */
 	group_first_block = ext3_group_first_block_no(sb, group);
+	group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);
 
 	/*
 	 * Basically we will allocate a new block from inode's reservation
@@ -1118,7 +1295,8 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 	 */
 	while (1) {
 		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
-			!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb)) {
+			!goal_in_my_reservation(&my_rsv->rsv_window,
+						grp_goal, group, sb)) {
 			if (my_rsv->rsv_goal_size < *count)
 				my_rsv->rsv_goal_size = *count;
 			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
@@ -1126,17 +1304,21 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
 			if (ret < 0)
 				break;			/* failed */
 
-			if (!goal_in_my_reservation(&my_rsv->rsv_window, grp_goal, group, sb))
+			if (!goal_in_my_reservation(&my_rsv->rsv_window,
+							grp_goal, group, sb))
 				grp_goal = -1;
-		} else if (grp_goal > 0 && (my_rsv->rsv_end-grp_goal+1) < *count)
+		} else if (grp_goal > 0 &&
+			  (my_rsv->rsv_end-grp_goal+1) < *count)
 			try_to_extend_reservation(my_rsv, sb,
 					*count-my_rsv->rsv_end + grp_goal - 1);
 
-		if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb))
-		    || (my_rsv->rsv_end < group_first_block))
+		if ((my_rsv->rsv_start > group_last_block) ||
+				(my_rsv->rsv_end < group_first_block)) {
+			rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
 			BUG();
-		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, grp_goal,
-					   &num, &my_rsv->rsv_window);
+		}
+		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
+					   grp_goal, &num, &my_rsv->rsv_window);
 		if (ret >= 0) {
 			my_rsv->rsv_alloc_hit += num;
 			*count = num;
@@ -1161,6 +1343,12 @@ out:
 	return ret;
 }
 
+/**
+ * ext3_has_free_blocks()
+ * @sbi:		in-core super block structure.
+ *
+ * Check if filesystem has at least 1 free block available for allocation.
+ */
 static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 {
 	ext3_fsblk_t free_blocks, root_blocks;
@@ -1175,11 +1363,17 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
 	return 1;
 }
 
-/*
+/**
+ * ext3_should_retry_alloc()
+ * @sb:			super block
+ * @retries		number of attemps has been made
+ *
  * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
  * it is profitable to retry the operation, this function will wait
  * for the current or commiting transaction to complete, and then
  * return TRUE.
+ *
+ * if the total number of retries exceed three times, return FALSE.
  */
 int ext3_should_retry_alloc(struct super_block *sb, int *retries)
 {
@@ -1191,13 +1385,19 @@ int ext3_should_retry_alloc(struct super_block *sb, int *retries)
 	return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
 }
 
-/*
- * ext3_new_block uses a goal block to assist allocation.  If the goal is
- * free, or there is a free block within 32 blocks of the goal, that block
- * is allocated.  Otherwise a forward search is made for a free block; within 
- * each block group the search first looks for an entire free byte in the block
- * bitmap, and then for any free bit if that fails.
- * This function also updates quota and i_blocks field.
+/**
+ * ext3_new_blocks() -- core block(s) allocation function
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		target number of blocks to allocate
+ * @errp:		error code
+ *
+ * ext3_new_blocks uses a goal block to assist allocation.  It tries to
+ * allocate block(s) from the block group contains the goal block first. If that
+ * fails, it will try to allocate block(s) from other block groups without
+ * any specific goal block.
+ *
  */
 ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 			ext3_fsblk_t goal, unsigned long *count, int *errp)
@@ -1303,7 +1503,7 @@ retry_alloc:
 	smp_rmb();
 
 	/*
-	 * Now search the rest of the groups.  We assume that 
+	 * Now search the rest of the groups.  We assume that
 	 * i and gdp correctly point to the last group visited.
 	 */
 	for (bgi = 0; bgi < ngroups; bgi++) {
@@ -1428,7 +1628,7 @@ allocated:
 
 	spin_lock(sb_bgl_lock(sbi, group_no));
 	gdp->bg_free_blocks_count =
-			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - num);
+			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
 	percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
 
@@ -1471,6 +1671,12 @@ ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
 	return ext3_new_blocks(handle, inode, goal, &count, errp);
 }
 
+/**
+ * ext3_count_free_blocks() -- count filesystem free blocks
+ * @sb:		superblock
+ *
+ * Adds up the number of free blocks from each block group.
+ */
 ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
 {
 	ext3_fsblk_t desc_count;
diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c
index ce4f82b9e528..b9176eed98d1 100644
--- a/fs/ext3/bitmap.c
+++ b/fs/ext3/bitmap.c
@@ -20,7 +20,7 @@ unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
 	unsigned int i;
 	unsigned long sum = 0;
 
-	if (!map) 
+	if (!map)
 		return (0);
 	for (i = 0; i < numchars; i++)
 		sum += nibblemap[map->b_data[i] & 0xf] +
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index fbb0d4ed07d4..d0b54f30b914 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -44,6 +44,9 @@ const struct file_operations ext3_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ext3_readdir,		/* we take BKL. needed?*/
 	.ioctl		= ext3_ioctl,		/* BKL held */
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext3_compat_ioctl,
+#endif
 	.fsync		= ext3_sync_file,	/* BKL held */
 #ifdef CONFIG_EXT3_INDEX
 	.release	= ext3_release_dir,
@@ -59,7 +62,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 
 	return (ext3_filetype_table[filetype]);
 }
-			       
+
 
 int ext3_check_dir_entry (const char * function, struct inode * dir,
 			  struct ext3_dir_entry_2 * de,
@@ -67,7 +70,7 @@ int ext3_check_dir_entry (const char * function, struct inode * dir,
 			  unsigned long offset)
 {
 	const char * error_msg = NULL;
- 	const int rlen = le16_to_cpu(de->rec_len);
+	const int rlen = le16_to_cpu(de->rec_len);
 
 	if (rlen < EXT3_DIR_REC_LEN(1))
 		error_msg = "rec_len is smaller than minimal";
@@ -162,7 +165,7 @@ revalidate:
 		 * to make sure. */
 		if (filp->f_version != inode->i_version) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
-				de = (struct ext3_dir_entry_2 *) 
+				de = (struct ext3_dir_entry_2 *)
 					(bh->b_data + i);
 				/* It's too expensive to do a full
 				 * dirent test each time round this
@@ -181,7 +184,7 @@ revalidate:
 			filp->f_version = inode->i_version;
 		}
 
-		while (!error && filp->f_pos < inode->i_size 
+		while (!error && filp->f_pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
 			if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
@@ -229,7 +232,7 @@ out:
 /*
  * These functions convert from the major/minor hash to an f_pos
  * value.
- * 
+ *
  * Currently we only use major hash numer.  This is unfortunate, but
  * on 32-bit machines, the same VFS interface is used for lseek and
  * llseek, so if we use the 64 bit offset, then the 32-bit versions of
@@ -250,7 +253,7 @@ out:
 struct fname {
 	__u32		hash;
 	__u32		minor_hash;
-	struct rb_node	rb_hash; 
+	struct rb_node	rb_hash;
 	struct fname	*next;
 	__u32		inode;
 	__u8		name_len;
@@ -343,10 +346,9 @@ int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
 
 	/* Create and allocate the fname structure */
 	len = sizeof(struct fname) + dirent->name_len + 1;
-	new_fn = kmalloc(len, GFP_KERNEL);
+	new_fn = kzalloc(len, GFP_KERNEL);
 	if (!new_fn)
 		return -ENOMEM;
-	memset(new_fn, 0, len);
 	new_fn->hash = hash;
 	new_fn->minor_hash = minor_hash;
 	new_fn->inode = le32_to_cpu(dirent->inode);
@@ -410,7 +412,7 @@ static int call_filldir(struct file * filp, void * dirent,
 	curr_pos = hash2pos(fname->hash, fname->minor_hash);
 	while (fname) {
 		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos, 
+				fname->name_len, curr_pos,
 				fname->inode,
 				get_dtype(sb, fname->file_type));
 		if (error) {
@@ -465,7 +467,7 @@ static int ext3_dx_readdir(struct file * filp,
 		/*
 		 * Fill the rbtree if we have no more entries,
 		 * or the inode has changed since we last read in the
-		 * cached entries. 
+		 * cached entries.
 		 */
 		if ((!info->curr_node) ||
 		    (filp->f_version != inode->i_version)) {
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 1efefb630ea9..e96c388047e0 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -48,14 +48,15 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
 }
 
 static ssize_t
-ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
+ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_dentry->d_inode;
 	ssize_t ret;
 	int err;
 
-	ret = generic_file_aio_write(iocb, buf, count, pos);
+	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
 
 	/*
 	 * Skip flushing if there was an error, or if nothing was written.
@@ -100,7 +101,7 @@ ext3_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
 
 force_commit:
 	err = ext3_force_commit(inode->i_sb);
-	if (err) 
+	if (err)
 		return err;
 	return ret;
 }
@@ -111,9 +112,10 @@ const struct file_operations ext3_file_operations = {
 	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= ext3_file_write,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
 	.ioctl		= ext3_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext3_compat_ioctl,
+#endif
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext3_release_file,
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 49382a208e05..dd1fd3c0fc05 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -8,14 +8,14 @@
  *                      Universite Pierre et Marie Curie (Paris VI)
  *  from
  *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
- * 
+ *
  *  ext3fs fsync primitive
  *
  *  Big-endian to little-endian byte-swapping/bitmaps by
  *        David S. Miller (davem@caip.rutgers.edu), 1995
- * 
+ *
  *  Removed unnecessary code duplication for little endian machines
- *  and excessive __inline__s. 
+ *  and excessive __inline__s.
  *        Andi Kleen, 1997
  *
  * Major simplications and cleanup - we only need to do the metadata, because
diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index 5a2d1235ead0..deeb27b5ba83 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -4,7 +4,7 @@
  * Copyright (C) 2002 by Theodore Ts'o
  *
  * This file is released under the GPL v2.
- * 
+ *
  * This file may be redistributed under the terms of the GNU Public
  * License.
  */
@@ -80,11 +80,11 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
  * Returns the hash of a filename.  If len is 0 and name is NULL, then
  * this function can be used to test whether or not a hash version is
  * supported.
- * 
+ *
  * The seed is an 4 longword (32 bits) "secret" which can be used to
  * uniquify a hash.  If the seed is all zero's, then some default seed
  * may be used.
- * 
+ *
  * A particular hash version specifies whether or not the seed is
  * represented, and whether or not the returned hash is 32 bits or 64
  * bits.  32 bit hashes will return 0 for the minor hash.
@@ -95,7 +95,7 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	__u32	minor_hash = 0;
 	const char	*p;
 	int		i;
-	__u32 		in[8], buf[4];
+	__u32		in[8], buf[4];
 
 	/* Initialize the default seed for the hash checksum functions */
 	buf[0] = 0x67452301;
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 36546ed36a14..e45dbd651736 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -202,7 +202,7 @@ error_return:
 static int find_group_dir(struct super_block *sb, struct inode *parent)
 {
 	int ngroups = EXT3_SB(sb)->s_groups_count;
-	int freei, avefreei;
+	unsigned int freei, avefreei;
 	struct ext3_group_desc *desc, *best_desc = NULL;
 	struct buffer_head *bh;
 	int group, best_group = -1;
@@ -216,7 +216,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
 			continue;
 		if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
 			continue;
-		if (!best_desc || 
+		if (!best_desc ||
 		    (le16_to_cpu(desc->bg_free_blocks_count) >
 		     le16_to_cpu(best_desc->bg_free_blocks_count))) {
 			best_group = group;
@@ -226,30 +226,30 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
 	return best_group;
 }
 
-/* 
- * Orlov's allocator for directories. 
- * 
+/*
+ * Orlov's allocator for directories.
+ *
  * We always try to spread first-level directories.
  *
- * If there are blockgroups with both free inodes and free blocks counts 
- * not worse than average we return one with smallest directory count. 
- * Otherwise we simply return a random group. 
- * 
- * For the rest rules look so: 
- * 
- * It's OK to put directory into a group unless 
- * it has too many directories already (max_dirs) or 
- * it has too few free inodes left (min_inodes) or 
- * it has too few free blocks left (min_blocks) or 
- * it's already running too large debt (max_debt). 
- * Parent's group is prefered, if it doesn't satisfy these 
- * conditions we search cyclically through the rest. If none 
- * of the groups look good we just look for a group with more 
- * free inodes than average (starting at parent's group). 
- * 
- * Debt is incremented each time we allocate a directory and decremented 
- * when we allocate an inode, within 0--255. 
- */ 
+ * If there are blockgroups with both free inodes and free blocks counts
+ * not worse than average we return one with smallest directory count.
+ * Otherwise we simply return a random group.
+ *
+ * For the rest rules look so:
+ *
+ * It's OK to put directory into a group unless
+ * it has too many directories already (max_dirs) or
+ * it has too few free inodes left (min_inodes) or
+ * it has too few free blocks left (min_blocks) or
+ * it's already running too large debt (max_debt).
+ * Parent's group is prefered, if it doesn't satisfy these
+ * conditions we search cyclically through the rest. If none
+ * of the groups look good we just look for a group with more
+ * free inodes than average (starting at parent's group).
+ *
+ * Debt is incremented each time we allocate a directory and decremented
+ * when we allocate an inode, within 0--255.
+ */
 
 #define INODE_COST 64
 #define BLOCK_COST 256
@@ -261,10 +261,10 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
 	struct ext3_super_block *es = sbi->s_es;
 	int ngroups = sbi->s_groups_count;
 	int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
-	int freei, avefreei;
+	unsigned int freei, avefreei;
 	ext3_fsblk_t freeb, avefreeb;
 	ext3_fsblk_t blocks_per_dir;
-	int ndirs;
+	unsigned int ndirs;
 	int max_debt, max_dirs, min_inodes;
 	ext3_grpblk_t min_blocks;
 	int group = -1, i;
@@ -454,7 +454,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
 			group = find_group_dir(sb, dir);
 		else
 			group = find_group_orlov(sb, dir);
-	} else 
+	} else
 		group = find_group_other(sb, dir);
 
 	err = -ENOSPC;
@@ -559,7 +559,6 @@ got:
 
 	inode->i_ino = ino;
 	/* This is the optimal IO size (for stat), not the fs block size */
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 84be02e93652..03ba5bcab186 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -13,11 +13,11 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *
  *  Goal-directed block allocation by Stephen Tweedie
- * 	(sct@redhat.com), 1993, 1998
+ *	(sct@redhat.com), 1993, 1998
  *  Big-endian to little-endian byte-swapping/bitmaps by
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  *  64-bit file support on 64-bit platforms by Jakub Jelinek
- * 	(jj@sunsite.ms.mff.cuni.cz)
+ *	(jj@sunsite.ms.mff.cuni.cz)
  *
  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
  */
@@ -36,6 +36,7 @@
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
+#include <linux/bio.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -55,7 +56,7 @@ static int ext3_inode_is_fast_symlink(struct inode *inode)
 /*
  * The ext3 forget function must perform a revoke if we are freeing data
  * which has been journaled.  Metadata (eg. indirect blocks) must be
- * revoked in all cases. 
+ * revoked in all cases.
  *
  * "bh" may be NULL: a metadata block may have been freed from memory
  * but there may still be a record of it in the journal, and that record
@@ -105,7 +106,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
  * Work out how many blocks we need to proceed with the next chunk of a
  * truncate transaction.
  */
-static unsigned long blocks_for_truncate(struct inode *inode) 
+static unsigned long blocks_for_truncate(struct inode *inode)
 {
 	unsigned long needed;
 
@@ -122,13 +123,13 @@ static unsigned long blocks_for_truncate(struct inode *inode)
 
 	/* But we need to bound the transaction so we don't overflow the
 	 * journal. */
-	if (needed > EXT3_MAX_TRANS_DATA) 
+	if (needed > EXT3_MAX_TRANS_DATA)
 		needed = EXT3_MAX_TRANS_DATA;
 
 	return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
 }
 
-/* 
+/*
  * Truncate transactions can be complex and absolutely huge.  So we need to
  * be able to restart the transaction at a conventient checkpoint to make
  * sure we don't overflow the journal.
@@ -136,9 +137,9 @@ static unsigned long blocks_for_truncate(struct inode *inode)
  * start_transaction gets us a new handle for a truncate transaction,
  * and extend_transaction tries to extend the existing one a bit.  If
  * extend fails, we need to propagate the failure up and restart the
- * transaction in the top-level truncate loop. --sct 
+ * transaction in the top-level truncate loop. --sct
  */
-static handle_t *start_transaction(struct inode *inode) 
+static handle_t *start_transaction(struct inode *inode)
 {
 	handle_t *result;
 
@@ -215,12 +216,12 @@ void ext3_delete_inode (struct inode * inode)
 	ext3_orphan_del(handle, inode);
 	EXT3_I(inode)->i_dtime	= get_seconds();
 
-	/* 
+	/*
 	 * One subtle ordering requirement: if anything has gone wrong
 	 * (transaction abort, IO errors, whatever), then we can still
 	 * do these next steps (the fs will already have been marked as
 	 * having errors), but we can't free the inode if the mark_dirty
-	 * fails.  
+	 * fails.
 	 */
 	if (ext3_mark_inode_dirty(handle, inode))
 		/* If that failed, just do the required in-core inode clear. */
@@ -398,7 +399,7 @@ no_block:
  *	  + if there is a block to the left of our position - allocate near it.
  *	  + if pointer will live in indirect block - allocate near that block.
  *	  + if pointer will live in inode - allocate in the same
- *	    cylinder group. 
+ *	    cylinder group.
  *
  * In the latter case we colour the starting block by the callers PID to
  * prevent it from clashing with concurrent allocations for a different inode
@@ -470,7 +471,7 @@ static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
  *	ext3_blks_to_allocate: Look up the block map and count the number
  *	of direct blocks need to be allocated for the given branch.
  *
- * 	@branch: chain of indirect blocks
+ *	@branch: chain of indirect blocks
  *	@k: number of blocks need for indirect blocks
  *	@blks: number of data blocks to be mapped.
  *	@blocks_to_boundary:  the offset in the indirect block
@@ -744,7 +745,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 		jbd_debug(5, "splicing indirect only\n");
 		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
 		err = ext3_journal_dirty_metadata(handle, where->bh);
-		if (err) 
+		if (err)
 			goto err_out;
 	} else {
 		/*
@@ -1073,7 +1074,7 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
 		return bh;
 	if (buffer_uptodate(bh))
 		return bh;
-	ll_rw_block(READ, 1, &bh);
+	ll_rw_block(READ_META, 1, &bh);
 	wait_on_buffer(bh);
 	if (buffer_uptodate(bh))
 		return bh;
@@ -1098,7 +1099,7 @@ static int walk_page_buffers(	handle_t *handle,
 
 	for (	bh = head, block_start = 0;
 		ret == 0 && (bh != head || !block_start);
-	    	block_start = block_end, bh = next)
+		block_start = block_end, bh = next)
 	{
 		next = bh->b_this_page;
 		block_end = block_start + blocksize;
@@ -1137,7 +1138,7 @@ static int walk_page_buffers(	handle_t *handle,
  * So what we do is to rely on the fact that journal_stop/journal_start
  * will _not_ run commit under these circumstances because handle->h_ref
  * is elevated.  We'll still have enough credits for the tiny quotafile
- * write.  
+ * write.
  */
 static int do_journal_get_write_access(handle_t *handle,
 					struct buffer_head *bh)
@@ -1282,7 +1283,7 @@ static int ext3_journalled_commit_write(struct file *file,
 	if (inode->i_size > EXT3_I(inode)->i_disksize) {
 		EXT3_I(inode)->i_disksize = inode->i_size;
 		ret2 = ext3_mark_inode_dirty(handle, inode);
-		if (!ret) 
+		if (!ret)
 			ret = ret2;
 	}
 	ret2 = ext3_journal_stop(handle);
@@ -1291,7 +1292,7 @@ static int ext3_journalled_commit_write(struct file *file,
 	return ret;
 }
 
-/* 
+/*
  * bmap() is special.  It gets used by applications such as lilo and by
  * the swapper to find the on-disk block of a specific piece of data.
  *
@@ -1300,10 +1301,10 @@ static int ext3_journalled_commit_write(struct file *file,
  * filesystem and enables swap, then they may get a nasty shock when the
  * data getting swapped to that swapfile suddenly gets overwritten by
  * the original zero's written out previously to the journal and
- * awaiting writeback in the kernel's buffer cache. 
+ * awaiting writeback in the kernel's buffer cache.
  *
  * So, if we see any bmap calls here on a modified, data-journaled file,
- * take extra steps to flush any blocks which might be in the cache. 
+ * take extra steps to flush any blocks which might be in the cache.
  */
 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
 {
@@ -1312,16 +1313,16 @@ static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
 	int err;
 
 	if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
-		/* 
+		/*
 		 * This is a REALLY heavyweight approach, but the use of
 		 * bmap on dirty files is expected to be extremely rare:
 		 * only if we run lilo or swapon on a freshly made file
-		 * do we expect this to happen. 
+		 * do we expect this to happen.
 		 *
 		 * (bmap requires CAP_SYS_RAWIO so this does not
 		 * represent an unprivileged user DOS attack --- we'd be
 		 * in trouble if mortal users could trigger this path at
-		 * will.) 
+		 * will.)
 		 *
 		 * NB. EXT3_STATE_JDATA is not set on files other than
 		 * regular files.  If somebody wants to bmap a directory
@@ -1457,7 +1458,7 @@ static int ext3_ordered_writepage(struct page *page,
 	 */
 
 	/*
-	 * And attach them to the current transaction.  But only if 
+	 * And attach them to the current transaction.  But only if
 	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
 	 * and generally junk.
 	 */
@@ -1644,7 +1645,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
 		}
 	}
 
-	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext3_get_block, NULL);
 
@@ -2025,7 +2026,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
 			   __le32 *first, __le32 *last)
 {
 	ext3_fsblk_t block_to_free = 0;    /* Starting block # of a run */
-	unsigned long count = 0;	    /* Number of blocks in the run */ 
+	unsigned long count = 0;	    /* Number of blocks in the run */
 	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
 					       corresponding to
 					       block_to_free */
@@ -2054,7 +2055,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
 			} else if (nr == block_to_free + count) {
 				count++;
 			} else {
-				ext3_clear_blocks(handle, inode, this_bh, 
+				ext3_clear_blocks(handle, inode, this_bh,
 						  block_to_free,
 						  count, block_to_free_p, p);
 				block_to_free = nr;
@@ -2115,7 +2116,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 			 */
 			if (!bh) {
 				ext3_error(inode->i_sb, "ext3_free_branches",
-					   "Read failure, inode=%ld, block="E3FSBLK,
+					   "Read failure, inode=%lu, block="E3FSBLK,
 					   inode->i_ino, nr);
 				continue;
 			}
@@ -2184,7 +2185,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 					*p = 0;
 					BUFFER_TRACE(parent_bh,
 					"call ext3_journal_dirty_metadata");
-					ext3_journal_dirty_metadata(handle, 
+					ext3_journal_dirty_metadata(handle,
 								    parent_bh);
 				}
 			}
@@ -2540,7 +2541,7 @@ make_io:
 		 */
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
-		submit_bh(READ, bh);
+		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			ext3_error(inode->i_sb, "ext3_get_inode_loc",
@@ -2632,9 +2633,6 @@ void ext3_read_inode(struct inode * inode)
 		 * recovery code: that's fine, we're about to complete
 		 * the process of deleting those. */
 	}
-	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size
-					 * (for stat), not the fs block
-					 * size */  
 	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
 	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 #ifdef EXT3_FRAGMENTS
@@ -2704,7 +2702,7 @@ void ext3_read_inode(struct inode * inode)
 		if (raw_inode->i_block[0])
 			init_special_inode(inode, inode->i_mode,
 			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
-		else 
+		else
 			init_special_inode(inode, inode->i_mode,
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	}
@@ -2724,8 +2722,8 @@ bad_inode:
  *
  * The caller must have write access to iloc->bh.
  */
-static int ext3_do_update_inode(handle_t *handle, 
-				struct inode *inode, 
+static int ext3_do_update_inode(handle_t *handle,
+				struct inode *inode,
 				struct ext3_iloc *iloc)
 {
 	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
@@ -2900,7 +2898,7 @@ int ext3_write_inode(struct inode *inode, int wait)
  * commit will leave the blocks being flushed in an unused state on
  * disk.  (On recovery, the inode will get truncated and the blocks will
  * be freed, so we have a strong guarantee that no future commit will
- * leave these blocks visible to the user.)  
+ * leave these blocks visible to the user.)
  *
  * Called with inode->sem down.
  */
@@ -3043,13 +3041,13 @@ int ext3_mark_iloc_dirty(handle_t *handle,
 	return err;
 }
 
-/* 
+/*
  * On success, We end up with an outstanding reference count against
- * iloc->bh.  This _must_ be cleaned up later. 
+ * iloc->bh.  This _must_ be cleaned up later.
  */
 
 int
-ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
+ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
 			 struct ext3_iloc *iloc)
 {
 	int err = 0;
@@ -3139,7 +3137,7 @@ out:
 }
 
 #if 0
-/* 
+/*
  * Bind an inode's backing buffer_head into this transaction, to prevent
  * it from being flushed to disk early.  Unlike
  * ext3_reserve_inode_write, this leaves behind no bh reference and
@@ -3157,7 +3155,7 @@ static int ext3_pin_inode(handle_t *handle, struct inode *inode)
 			BUFFER_TRACE(iloc.bh, "get_write_access");
 			err = journal_get_write_access(handle, iloc.bh);
 			if (!err)
-				err = ext3_journal_dirty_metadata(handle, 
+				err = ext3_journal_dirty_metadata(handle,
 								  iloc.bh);
 			brelse(iloc.bh);
 		}
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 3a6b012d120c..12daa6869572 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -13,9 +13,10 @@
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
 #include <linux/time.h>
+#include <linux/compat.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
-
 int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 		unsigned long arg)
 {
@@ -252,3 +253,55 @@ flags_err:
 		return -ENOTTY;
 	}
 }
+
+#ifdef CONFIG_COMPAT
+long ext3_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int ret;
+
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case EXT3_IOC32_GETFLAGS:
+		cmd = EXT3_IOC_GETFLAGS;
+		break;
+	case EXT3_IOC32_SETFLAGS:
+		cmd = EXT3_IOC_SETFLAGS;
+		break;
+	case EXT3_IOC32_GETVERSION:
+		cmd = EXT3_IOC_GETVERSION;
+		break;
+	case EXT3_IOC32_SETVERSION:
+		cmd = EXT3_IOC_SETVERSION;
+		break;
+	case EXT3_IOC32_GROUP_EXTEND:
+		cmd = EXT3_IOC_GROUP_EXTEND;
+		break;
+	case EXT3_IOC32_GETVERSION_OLD:
+		cmd = EXT3_IOC_GETVERSION_OLD;
+		break;
+	case EXT3_IOC32_SETVERSION_OLD:
+		cmd = EXT3_IOC_SETVERSION_OLD;
+		break;
+#ifdef CONFIG_JBD_DEBUG
+	case EXT3_IOC32_WAIT_FOR_READONLY:
+		cmd = EXT3_IOC_WAIT_FOR_READONLY;
+		break;
+#endif
+	case EXT3_IOC32_GETRSVSZ:
+		cmd = EXT3_IOC_GETRSVSZ;
+		break;
+	case EXT3_IOC32_SETRSVSZ:
+		cmd = EXT3_IOC_SETRSVSZ;
+		break;
+	case EXT3_IOC_GROUP_ADD:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	lock_kernel();
+	ret = ext3_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+	unlock_kernel();
+	return ret;
+}
+#endif
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 2aa7101b27cd..906731a20f1a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -15,13 +15,13 @@
  *  Big-endian to little-endian byte-swapping/bitmaps by
  *        David S. Miller (davem@caip.rutgers.edu), 1995
  *  Directory entry file type support and forward compatibility hooks
- *  	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
+ *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
  *  Hash Tree Directory indexing (c)
- *  	Daniel Phillips, 2001
+ *	Daniel Phillips, 2001
  *  Hash Tree Directory indexing porting
- *  	Christopher Li, 2002
+ *	Christopher Li, 2002
  *  Hash Tree Directory indexing cleanup
- * 	Theodore Ts'o, 2002
+ *	Theodore Ts'o, 2002
  */
 
 #include <linux/fs.h>
@@ -35,6 +35,7 @@
 #include <linux/string.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/bio.h>
 #include <linux/smp_lock.h>
 
 #include "namei.h"
@@ -76,7 +77,7 @@ static struct buffer_head *ext3_append(handle_t *handle,
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
-#define dxtrace(command) 
+#define dxtrace(command)
 #endif
 
 struct fake_dirent
@@ -169,7 +170,7 @@ static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
 static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
-				 struct dx_frame *frames, 
+				 struct dx_frame *frames,
 				 __u32 *start_hash);
 static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
 		       struct ext3_dir_entry_2 **res_dir, int *err);
@@ -250,7 +251,7 @@ static void dx_show_index (char * label, struct dx_entry *entries)
 }
 
 struct stats
-{ 
+{
 	unsigned names;
 	unsigned space;
 	unsigned bcount;
@@ -278,7 +279,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_ent
 				       ((char *) de - base));
 			}
 			space += EXT3_DIR_REC_LEN(de->name_len);
-	 		names++;
+			names++;
 		}
 		de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
 	}
@@ -464,7 +465,7 @@ static void dx_release (struct dx_frame *frames)
  */
 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
 				 struct dx_frame *frame,
-				 struct dx_frame *frames, 
+				 struct dx_frame *frames,
 				 __u32 *start_hash)
 {
 	struct dx_frame *p;
@@ -632,7 +633,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 		}
 		count += ret;
 		hashval = ~0;
-		ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, 
+		ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
 					    frame, frames, &hashval);
 		*next_hash = hashval;
 		if (ret < 0) {
@@ -649,7 +650,7 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 			break;
 	}
 	dx_release(frames);
-	dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", 
+	dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
 		       count, *next_hash));
 	return count;
 errout:
@@ -870,7 +871,7 @@ restart:
 				bh = ext3_getblk(NULL, dir, b++, 0, &err);
 				bh_use[ra_max] = bh;
 				if (bh)
-					ll_rw_block(READ, 1, &bh);
+					ll_rw_block(READ_META, 1, &bh);
 			}
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -1050,7 +1051,7 @@ struct dentry *ext3_get_parent(struct dentry *child)
 		parent = ERR_PTR(-ENOMEM);
 	}
 	return parent;
-} 
+}
 
 #define S_SHIFT 12
 static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -1198,7 +1199,7 @@ errout:
  * add_dirent_to_buf will attempt search the directory block for
  * space.  It will return -ENOSPC if no space is available, and -EIO
  * and -EEXIST if directory entry already exists.
- * 
+ *
  * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
  * all other cases bh is released.
  */
@@ -1572,7 +1573,7 @@ cleanup:
  * ext3_delete_entry deletes a directory entry by merging it with the
  * previous entry
  */
-static int ext3_delete_entry (handle_t *handle, 
+static int ext3_delete_entry (handle_t *handle,
 			      struct inode * dir,
 			      struct ext3_dir_entry_2 * de_del,
 			      struct buffer_head * bh)
@@ -1615,12 +1616,12 @@ static int ext3_delete_entry (handle_t *handle,
  */
 static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
 {
-	inode->i_nlink++;
+	inc_nlink(inode);
 }
 
 static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
 {
-	inode->i_nlink--;
+	drop_nlink(inode);
 }
 
 static int ext3_add_nondir(handle_t *handle,
@@ -1643,12 +1644,12 @@ static int ext3_add_nondir(handle_t *handle,
  * is so far negative - it has no inode.
  *
  * If the create succeeds, we fill in the inode information
- * with d_instantiate(). 
+ * with d_instantiate().
  */
 static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
 		struct nameidata *nd)
 {
-	handle_t *handle; 
+	handle_t *handle;
 	struct inode * inode;
 	int err, retries = 0;
 
@@ -1688,7 +1689,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
 
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-			 		EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
 					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -1742,7 +1743,7 @@ retry:
 	inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
 	dir_block = ext3_bread (handle, inode, 0, 1, &err);
 	if (!dir_block) {
-		inode->i_nlink--; /* is this nlink == 0? */
+		drop_nlink(inode); /* is this nlink == 0? */
 		ext3_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -1774,7 +1775,7 @@ retry:
 		iput (inode);
 		goto out_stop;
 	}
-	dir->i_nlink++;
+	inc_nlink(dir);
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
@@ -1813,10 +1814,10 @@ static int empty_dir (struct inode * inode)
 	de1 = (struct ext3_dir_entry_2 *)
 			((char *) de + le16_to_cpu(de->rec_len));
 	if (le32_to_cpu(de->inode) != inode->i_ino ||
-			!le32_to_cpu(de1->inode) || 
+			!le32_to_cpu(de1->inode) ||
 			strcmp (".", de->name) ||
 			strcmp ("..", de1->name)) {
-	    	ext3_warning (inode->i_sb, "empty_dir",
+		ext3_warning (inode->i_sb, "empty_dir",
 			      "bad directory (dir #%lu) - no `.' or `..'",
 			      inode->i_ino);
 		brelse (bh);
@@ -1883,7 +1884,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
 	 * being truncated, or files being unlinked. */
 
 	/* @@@ FIXME: Observation from aviro:
-	 * I think I can trigger J_ASSERT in ext3_orphan_add().  We block 
+	 * I think I can trigger J_ASSERT in ext3_orphan_add().  We block
 	 * here (on lock_super()), so race with ext3_link() which might bump
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
@@ -1919,8 +1920,8 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
 	if (!err)
 		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
 
-	jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
-	jbd_debug(4, "orphan inode %ld will point to %d\n",
+	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+	jbd_debug(4, "orphan inode %lu will point to %d\n",
 			inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
 	unlock_super(sb);
@@ -2044,7 +2045,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 			      "empty directory has nlink!=2 (%d)",
 			      inode->i_nlink);
 	inode->i_version++;
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	/* There's no need to set i_disksize: the fact that i_nlink is
 	 * zero will ensure that the right thing happens during any
 	 * recovery. */
@@ -2052,7 +2053,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 	ext3_orphan_add(handle, inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	ext3_mark_inode_dirty(handle, inode);
-	dir->i_nlink--;
+	drop_nlink(dir);
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
 
@@ -2103,7 +2104,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
-	inode->i_nlink--;
+	drop_nlink(inode);
 	if (!inode->i_nlink)
 		ext3_orphan_add(handle, inode);
 	inode->i_ctime = dir->i_ctime;
@@ -2129,7 +2130,7 @@ static int ext3_symlink (struct inode * dir,
 
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
-			 		EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
 					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2227,7 +2228,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		DQUOT_INIT(new_dentry->d_inode);
 	handle = ext3_journal_start(old_dir, 2 *
 					EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
-			 		EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
+					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2325,7 +2326,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	}
 
 	if (new_inode) {
-		new_inode->i_nlink--;
+		drop_nlink(new_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 	}
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
@@ -2336,11 +2337,11 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
 		BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
 		ext3_journal_dirty_metadata(handle, dir_bh);
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		if (new_inode) {
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		} else {
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 			ext3_update_dx_flag(new_dir);
 			ext3_mark_inode_dirty(handle, new_dir);
 		}
@@ -2393,4 +2394,4 @@ struct inode_operations ext3_special_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext3_permission,
-}; 
+};
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 5e1337fd878a..b73cba12f79c 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -336,7 +336,7 @@ static int verify_reserved_gdb(struct super_block *sb,
 	unsigned five = 5;
 	unsigned seven = 7;
 	unsigned grp;
-	__u32 *p = (__u32 *)primary->b_data;
+	__le32 *p = (__le32 *)primary->b_data;
 	int gdbackups = 0;
 
 	while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) {
@@ -380,7 +380,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	struct buffer_head *dind;
 	int gdbackups;
 	struct ext3_iloc iloc;
-	__u32 *data;
+	__le32 *data;
 	int err;
 
 	if (test_opt(sb, DEBUG))
@@ -417,7 +417,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 		goto exit_bh;
 	}
 
-	data = (__u32 *)dind->b_data;
+	data = (__le32 *)dind->b_data;
 	if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) {
 		ext3_warning(sb, __FUNCTION__,
 			     "new group %u GDT block "E3FSBLK" not reserved",
@@ -439,8 +439,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	if ((err = ext3_reserve_inode_write(handle, inode, &iloc)))
 		goto exit_dindj;
 
-	n_group_desc = (struct buffer_head **)kmalloc((gdb_num + 1) *
-				sizeof(struct buffer_head *), GFP_KERNEL);
+	n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+			GFP_KERNEL);
 	if (!n_group_desc) {
 		err = -ENOMEM;
 		ext3_warning (sb, __FUNCTION__,
@@ -519,7 +519,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	struct buffer_head *dind;
 	struct ext3_iloc iloc;
 	ext3_fsblk_t blk;
-	__u32 *data, *end;
+	__le32 *data, *end;
 	int gdbackups = 0;
 	int res, i;
 	int err;
@@ -536,8 +536,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	}
 
 	blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
-	data = (__u32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count;
-	end = (__u32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
+	data = (__le32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count;
+	end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
 
 	/* Get each reserved primary GDT block and verify it holds backups */
 	for (res = 0; res < reserved_gdb; res++, blk++) {
@@ -545,7 +545,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 			ext3_warning(sb, __FUNCTION__,
 				     "reserved block "E3FSBLK
 				     " not at offset %ld",
-				     blk, (long)(data - (__u32 *)dind->b_data));
+				     blk,
+				     (long)(data - (__le32 *)dind->b_data));
 			err = -EINVAL;
 			goto exit_bh;
 		}
@@ -560,7 +561,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 			goto exit_bh;
 		}
 		if (++data >= end)
-			data = (__u32 *)dind->b_data;
+			data = (__le32 *)dind->b_data;
 	}
 
 	for (i = 0; i < reserved_gdb; i++) {
@@ -584,7 +585,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	blk = input->group * EXT3_BLOCKS_PER_GROUP(sb);
 	for (i = 0; i < reserved_gdb; i++) {
 		int err2;
-		data = (__u32 *)primary[i]->b_data;
+		data = (__le32 *)primary[i]->b_data;
 		/* printk("reserving backup %lu[%u] = %lu\n",
 		       primary[i]->b_blocknr, gdbackups,
 		       blk + primary[i]->b_blocknr); */
@@ -689,7 +690,7 @@ exit_err:
 			     "can't update backup for group %d (err %d), "
 			     "forcing fsck on next reboot", group, err);
 		sbi->s_mount_state &= ~EXT3_VALID_FS;
-		sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS);
+		sbi->s_es->s_state &= cpu_to_le16(~EXT3_VALID_FS);
 		mark_buffer_dirty(sbi->s_sbh);
 	}
 }
@@ -730,6 +731,18 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 		return -EPERM;
 	}
 
+	if (le32_to_cpu(es->s_blocks_count) + input->blocks_count <
+	    le32_to_cpu(es->s_blocks_count)) {
+		ext3_warning(sb, __FUNCTION__, "blocks_count overflow\n");
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) <
+	    le32_to_cpu(es->s_inodes_count)) {
+		ext3_warning(sb, __FUNCTION__, "inodes_count overflow\n");
+		return -EINVAL;
+	}
+
 	if (reserved_gdb || gdb_off == 0) {
 		if (!EXT3_HAS_COMPAT_FEATURE(sb,
 					     EXT3_FEATURE_COMPAT_RESIZE_INODE)){
@@ -958,6 +971,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 
 	add = EXT3_BLOCKS_PER_GROUP(sb) - last;
 
+	if (o_blocks_count + add < o_blocks_count) {
+		ext3_warning(sb, __FUNCTION__, "blocks_count overflow");
+		return -EINVAL;
+	}
+
 	if (o_blocks_count + add > n_blocks_count)
 		add = n_blocks_count - o_blocks_count;
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3559086eee5f..afc2d4f42d77 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -45,7 +45,7 @@
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
-			       int);
+			       unsigned int);
 static void ext3_commit_super (struct super_block * sb,
 			       struct ext3_super_block * es,
 			       int sync);
@@ -62,13 +62,13 @@ static void ext3_unlockfs(struct super_block *sb);
 static void ext3_write_super (struct super_block * sb);
 static void ext3_write_super_lockfs(struct super_block *sb);
 
-/* 
+/*
  * Wrappers for journal_start/end.
  *
  * The only special thing we need to do here is to make sure that all
  * journal_end calls result in the superblock being marked dirty, so
  * that sync() will call the filesystem's write_super callback if
- * appropriate. 
+ * appropriate.
  */
 handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
 {
@@ -90,11 +90,11 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
 	return journal_start(journal, nblocks);
 }
 
-/* 
+/*
  * The only special thing we need to do here is to make sure that all
  * journal_stop calls result in the superblock being marked dirty, so
  * that sync() will call the filesystem's write_super callback if
- * appropriate. 
+ * appropriate.
  */
 int __ext3_journal_stop(const char *where, handle_t *handle)
 {
@@ -159,20 +159,21 @@ static void ext3_handle_error(struct super_block *sb)
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	if (test_opt (sb, ERRORS_RO)) {
-		printk (KERN_CRIT "Remounting filesystem read-only\n");
-		sb->s_flags |= MS_RDONLY;
-	} else {
+	if (!test_opt (sb, ERRORS_CONT)) {
 		journal_t *journal = EXT3_SB(sb)->s_journal;
 
 		EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
 		if (journal)
 			journal_abort(journal, -EIO);
 	}
+	if (test_opt (sb, ERRORS_RO)) {
+		printk (KERN_CRIT "Remounting filesystem read-only\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+	ext3_commit_super(sb, es, 1);
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT3-fs (device %s): panic forced after error\n",
 			sb->s_id);
-	ext3_commit_super(sb, es, 1);
 }
 
 void ext3_error (struct super_block * sb, const char * function,
@@ -369,16 +370,16 @@ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
 {
 	struct list_head *l;
 
-	printk(KERN_ERR "sb orphan head is %d\n", 
+	printk(KERN_ERR "sb orphan head is %d\n",
 	       le32_to_cpu(sbi->s_es->s_last_orphan));
 
 	printk(KERN_ERR "sb_info orphan list:\n");
 	list_for_each(l, &sbi->s_orphan) {
 		struct inode *inode = orphan_list_entry(l);
 		printk(KERN_ERR "  "
-		       "inode %s:%ld at %p: mode %o, nlink %d, next %d\n",
+		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
 		       inode->i_sb->s_id, inode->i_ino, inode,
-		       inode->i_mode, inode->i_nlink, 
+		       inode->i_mode, inode->i_nlink,
 		       NEXT_ORPHAN(inode));
 	}
 }
@@ -475,7 +476,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 		inode_init_once(&ei->vfs_inode);
 	}
 }
- 
+
 static int init_inodecache(void)
 {
 	ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
@@ -490,8 +491,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(ext3_inode_cachep))
-		printk(KERN_INFO "ext3_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(ext3_inode_cachep);
 }
 
 static void ext3_clear_inode(struct inode *inode)
@@ -733,8 +733,8 @@ static match_table_t tokens = {
 
 static ext3_fsblk_t get_sb_block(void **data)
 {
-	ext3_fsblk_t 	sb_block;
-	char 		*options = (char *) *data;
+	ext3_fsblk_t	sb_block;
+	char		*options = (char *) *data;
 
 	if (!options || strncmp(options, "sb=", 3) != 0)
 		return 1;	/* Default location */
@@ -753,7 +753,7 @@ static ext3_fsblk_t get_sb_block(void **data)
 }
 
 static int parse_options (char *options, struct super_block *sb,
-			  unsigned long *inum, unsigned long *journal_devnum,
+			  unsigned int *inum, unsigned long *journal_devnum,
 			  ext3_fsblk_t *n_blocks_count, int is_remount)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
@@ -1174,7 +1174,8 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 static int ext3_check_descriptors (struct super_block * sb)
 {
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	ext3_fsblk_t block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext3_fsblk_t last_block;
 	struct ext3_group_desc * gdp = NULL;
 	int desc_block = 0;
 	int i;
@@ -1183,12 +1184,17 @@ static int ext3_check_descriptors (struct super_block * sb)
 
 	for (i = 0; i < sbi->s_groups_count; i++)
 	{
+		if (i == sbi->s_groups_count - 1)
+			last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1;
+		else
+			last_block = first_block +
+				(EXT3_BLOCKS_PER_GROUP(sb) - 1);
+
 		if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0)
 			gdp = (struct ext3_group_desc *)
 					sbi->s_group_desc[desc_block++]->b_data;
-		if (le32_to_cpu(gdp->bg_block_bitmap) < block ||
-		    le32_to_cpu(gdp->bg_block_bitmap) >=
-				block + EXT3_BLOCKS_PER_GROUP(sb))
+		if (le32_to_cpu(gdp->bg_block_bitmap) < first_block ||
+		    le32_to_cpu(gdp->bg_block_bitmap) > last_block)
 		{
 			ext3_error (sb, "ext3_check_descriptors",
 				    "Block bitmap for group %d"
@@ -1197,9 +1203,8 @@ static int ext3_check_descriptors (struct super_block * sb)
 					le32_to_cpu(gdp->bg_block_bitmap));
 			return 0;
 		}
-		if (le32_to_cpu(gdp->bg_inode_bitmap) < block ||
-		    le32_to_cpu(gdp->bg_inode_bitmap) >=
-				block + EXT3_BLOCKS_PER_GROUP(sb))
+		if (le32_to_cpu(gdp->bg_inode_bitmap) < first_block ||
+		    le32_to_cpu(gdp->bg_inode_bitmap) > last_block)
 		{
 			ext3_error (sb, "ext3_check_descriptors",
 				    "Inode bitmap for group %d"
@@ -1208,9 +1213,9 @@ static int ext3_check_descriptors (struct super_block * sb)
 					le32_to_cpu(gdp->bg_inode_bitmap));
 			return 0;
 		}
-		if (le32_to_cpu(gdp->bg_inode_table) < block ||
-		    le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >=
-		    block + EXT3_BLOCKS_PER_GROUP(sb))
+		if (le32_to_cpu(gdp->bg_inode_table) < first_block ||
+		    le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >
+		    last_block)
 		{
 			ext3_error (sb, "ext3_check_descriptors",
 				    "Inode table for group %d"
@@ -1219,7 +1224,7 @@ static int ext3_check_descriptors (struct super_block * sb)
 					le32_to_cpu(gdp->bg_inode_table));
 			return 0;
 		}
-		block += EXT3_BLOCKS_PER_GROUP(sb);
+		first_block += EXT3_BLOCKS_PER_GROUP(sb);
 		gdp++;
 	}
 
@@ -1301,17 +1306,17 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 		DQUOT_INIT(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
-				"%s: truncating inode %ld to %Ld bytes\n",
+				"%s: truncating inode %lu to %Ld bytes\n",
 				__FUNCTION__, inode->i_ino, inode->i_size);
-			jbd_debug(2, "truncating inode %ld to %Ld bytes\n",
+			jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
 				  inode->i_ino, inode->i_size);
 			ext3_truncate(inode);
 			nr_truncates++;
 		} else {
 			printk(KERN_DEBUG
-				"%s: deleting unreferenced inode %ld\n",
+				"%s: deleting unreferenced inode %lu\n",
 				__FUNCTION__, inode->i_ino);
-			jbd_debug(2, "deleting unreferenced inode %ld\n",
+			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
 			nr_orphans++;
 		}
@@ -1390,7 +1395,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	ext3_fsblk_t sb_block = get_sb_block(&data);
 	ext3_fsblk_t logic_sb_block;
 	unsigned long offset = 0;
-	unsigned long journal_inum = 0;
+	unsigned int journal_inum = 0;
 	unsigned long journal_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
@@ -1401,11 +1406,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	int needs_recovery;
 	__le32 features;
 
-	sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 	sb->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(*sbi));
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT3_DEF_RESUID;
 	sbi->s_resgid = EXT3_DEF_RESGID;
@@ -1466,6 +1470,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
 	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT3_ERRORS_RO)
 		set_opt(sbi->s_mount_opt, ERRORS_RO);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_CONT);
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -1483,7 +1489,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	    (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-		printk(KERN_WARNING 
+		printk(KERN_WARNING
 		       "EXT3-fs warning: feature flags set on rev 0 fs, "
 		       "running e2fsck is recommended\n");
 	/*
@@ -1509,7 +1515,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 
 	if (blocksize < EXT3_MIN_BLOCK_SIZE ||
 	    blocksize > EXT3_MAX_BLOCK_SIZE) {
-		printk(KERN_ERR 
+		printk(KERN_ERR
 		       "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
 		       blocksize, sb->s_id);
 		goto failed_mount;
@@ -1533,14 +1539,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
 		bh = sb_bread(sb, logic_sb_block);
 		if (!bh) {
-			printk(KERN_ERR 
+			printk(KERN_ERR
 			       "EXT3-fs: Can't read superblock on 2nd try.\n");
 			goto failed_mount;
 		}
 		es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
-			printk (KERN_ERR 
+			printk (KERN_ERR
 				"EXT3-fs: Magic mismatch, very weird !\n");
 			goto failed_mount;
 		}
@@ -1622,10 +1628,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 
 	if (EXT3_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext3;
-	sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
-			       le32_to_cpu(es->s_first_data_block) +
-			       EXT3_BLOCKS_PER_GROUP(sb) - 1) /
-			      EXT3_BLOCKS_PER_GROUP(sb);
+	sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
+			       le32_to_cpu(es->s_first_data_block) - 1)
+				       / EXT3_BLOCKS_PER_GROUP(sb)) + 1;
 	db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
 		   EXT3_DESC_PER_BLOCK(sb);
 	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
@@ -1820,7 +1825,7 @@ out_fail:
 /*
  * Setup any per-fs journal parameters now.  We'll do this both on
  * initial mount, once the journal has been initialised but before we've
- * done any recovery; and again on any subsequent remount. 
+ * done any recovery; and again on any subsequent remount.
  */
 static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
 {
@@ -1840,7 +1845,8 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
 	spin_unlock(&journal->j_state_lock);
 }
 
-static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
+static journal_t *ext3_get_journal(struct super_block *sb,
+				   unsigned int journal_inum)
 {
 	struct inode *journal_inode;
 	journal_t *journal;
@@ -1975,7 +1981,7 @@ static int ext3_load_journal(struct super_block *sb,
 			     unsigned long journal_devnum)
 {
 	journal_t *journal;
-	int journal_inum = le32_to_cpu(es->s_journal_inum);
+	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
 	dev_t journal_dev;
 	int err = 0;
 	int really_read_only;
@@ -2061,7 +2067,7 @@ static int ext3_load_journal(struct super_block *sb,
 
 static int ext3_create_journal(struct super_block * sb,
 			       struct ext3_super_block * es,
-			       int journal_inum)
+			       unsigned int journal_inum)
 {
 	journal_t *journal;
 
@@ -2074,7 +2080,7 @@ static int ext3_create_journal(struct super_block * sb,
 	if (!(journal = ext3_get_journal(sb, journal_inum)))
 		return -EINVAL;
 
-	printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n",
+	printk(KERN_INFO "EXT3-fs: creating new journal on inode %u\n",
 	       journal_inum);
 
 	if (journal_create(journal)) {
@@ -2342,10 +2348,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			 */
 			ext3_clear_journal_err(sb, es);
 			sbi->s_mount_state = le16_to_cpu(es->s_state);
-			if ((ret = ext3_group_extend(sb, es, n_blocks_count))) {
-				err = ret;
+			if ((err = ext3_group_extend(sb, es, n_blocks_count)))
 				goto restore_opts;
-			}
 			if (!ext3_setup_super (sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
 		}
@@ -2734,7 +2738,7 @@ static int __init init_ext3_fs(void)
 out:
 	destroy_inodecache();
 out1:
- 	exit_ext3_xattr();
+	exit_ext3_xattr();
 	return err;
 }
 
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index a44a0562203a..f86f2482f01d 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -75,7 +75,7 @@
 
 #ifdef EXT3_XATTR_DEBUG
 # define ea_idebug(inode, f...) do { \
-		printk(KERN_DEBUG "inode %s:%ld: ", \
+		printk(KERN_DEBUG "inode %s:%lu: ", \
 			inode->i_sb->s_id, inode->i_ino); \
 		printk(f); \
 		printk("\n"); \
@@ -233,7 +233,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 bad_block:	ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block "E3FSBLK, inode->i_ino,
+			   "inode %lu: bad block "E3FSBLK, inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -375,7 +375,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext3_xattr_check_block(bh)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			   "inode %ld: bad block "E3FSBLK, inode->i_ino,
+			   "inode %lu: bad block "E3FSBLK, inode->i_ino,
 			   EXT3_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
@@ -647,7 +647,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i,
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext3_xattr_check_block(bs->bh)) {
 			ext3_error(sb, __FUNCTION__,
-				"inode %ld: bad block "E3FSBLK, inode->i_ino,
+				"inode %lu: bad block "E3FSBLK, inode->i_ino,
 				EXT3_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
@@ -848,7 +848,7 @@ cleanup_dquot:
 
 bad_block:
 	ext3_error(inode->i_sb, __FUNCTION__,
-		   "inode %ld: bad block "E3FSBLK, inode->i_ino,
+		   "inode %lu: bad block "E3FSBLK, inode->i_ino,
 		   EXT3_I(inode)->i_file_acl);
 	goto cleanup;
 
@@ -1077,14 +1077,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: block "E3FSBLK" read error", inode->i_ino,
+			"inode %lu: block "E3FSBLK" read error", inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext3_error(inode->i_sb, __FUNCTION__,
-			"inode %ld: bad block "E3FSBLK, inode->i_ino,
+			"inode %lu: bad block "E3FSBLK, inode->i_ino,
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
@@ -1211,7 +1211,7 @@ again:
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
 			ext3_error(inode->i_sb, __FUNCTION__,
-				"inode %ld: block %lu read error",
+				"inode %lu: block %lu read error",
 				inode->i_ino, (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT3_XATTR_REFCOUNT_MAX) {
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
new file mode 100644
index 000000000000..a6acb96ebeb9
--- /dev/null
+++ b/fs/ext4/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for the linux ext4-filesystem routines.
+#
+
+obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+
+ext4dev-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+	   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
+
+ext4dev-$(CONFIG_EXT4DEV_FS_XATTR)	+= xattr.o xattr_user.o xattr_trusted.o
+ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL)	+= acl.o
+ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY)	+= xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
new file mode 100644
index 000000000000..9e882546d91a
--- /dev/null
+++ b/fs/ext4/acl.c
@@ -0,0 +1,551 @@
+/*
+ * linux/fs/ext4/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from filesystem to in-memory representation.
+ */
+static struct posix_acl *
+ext4_acl_from_disk(const void *value, size_t size)
+{
+	const char *end = (char *)value + size;
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(ext4_acl_header))
+		 return ERR_PTR(-EINVAL);
+	if (((ext4_acl_header *)value)->a_version !=
+	    cpu_to_le32(EXT4_ACL_VERSION))
+		return ERR_PTR(-EINVAL);
+	value = (char *)value + sizeof(ext4_acl_header);
+	count = ext4_acl_count(size);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n=0; n < count; n++) {
+		ext4_acl_entry *entry =
+			(ext4_acl_entry *)value;
+		if ((char *)value + sizeof(ext4_acl_entry_short) > end)
+			goto fail;
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		switch(acl->a_entries[n].e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				value = (char *)value +
+					sizeof(ext4_acl_entry_short);
+				acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
+				break;
+
+			case ACL_USER:
+			case ACL_GROUP:
+				value = (char *)value + sizeof(ext4_acl_entry);
+				if ((char *)value > end)
+					goto fail;
+				acl->a_entries[n].e_id =
+					le32_to_cpu(entry->e_id);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	if (value != end)
+		goto fail;
+	return acl;
+
+fail:
+	posix_acl_release(acl);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Convert from in-memory to filesystem representation.
+ */
+static void *
+ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+	ext4_acl_header *ext_acl;
+	char *e;
+	size_t n;
+
+	*size = ext4_acl_size(acl->a_count);
+	ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count *
+			sizeof(ext4_acl_entry), GFP_KERNEL);
+	if (!ext_acl)
+		return ERR_PTR(-ENOMEM);
+	ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
+	e = (char *)ext_acl + sizeof(ext4_acl_header);
+	for (n=0; n < acl->a_count; n++) {
+		ext4_acl_entry *entry = (ext4_acl_entry *)e;
+		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		switch(acl->a_entries[n].e_tag) {
+			case ACL_USER:
+			case ACL_GROUP:
+				entry->e_id =
+					cpu_to_le32(acl->a_entries[n].e_id);
+				e += sizeof(ext4_acl_entry);
+				break;
+
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_MASK:
+			case ACL_OTHER:
+				e += sizeof(ext4_acl_entry_short);
+				break;
+
+			default:
+				goto fail;
+		}
+	}
+	return (char *)ext_acl;
+
+fail:
+	kfree(ext_acl);
+	return ERR_PTR(-EINVAL);
+}
+
+static inline struct posix_acl *
+ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl)
+{
+	struct posix_acl *acl = EXT4_ACL_NOT_CACHED;
+
+	spin_lock(&inode->i_lock);
+	if (*i_acl != EXT4_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*i_acl);
+	spin_unlock(&inode->i_lock);
+
+	return acl;
+}
+
+static inline void
+ext4_iset_acl(struct inode *inode, struct posix_acl **i_acl,
+		struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*i_acl != EXT4_ACL_NOT_CACHED)
+		posix_acl_release(*i_acl);
+	*i_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Inode operation get_posix_acl().
+ *
+ * inode->i_mutex: don't care
+ */
+static struct posix_acl *
+ext4_get_acl(struct inode *inode, int type)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return NULL;
+
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			acl = ext4_iget_acl(inode, &ei->i_acl);
+			if (acl != EXT4_ACL_NOT_CACHED)
+				return acl;
+			name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			acl = ext4_iget_acl(inode, &ei->i_default_acl);
+			if (acl != EXT4_ACL_NOT_CACHED)
+				return acl;
+			name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+			break;
+
+		default:
+			return ERR_PTR(-EINVAL);
+	}
+	retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_KERNEL);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ext4_xattr_get(inode, name_index, "", value, retval);
+	}
+	if (retval > 0)
+		acl = ext4_acl_from_disk(value, retval);
+	else if (retval == -ENODATA || retval == -ENOSYS)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+	kfree(value);
+
+	if (!IS_ERR(acl)) {
+		switch(type) {
+			case ACL_TYPE_ACCESS:
+				ext4_iset_acl(inode, &ei->i_acl, acl);
+				break;
+
+			case ACL_TYPE_DEFAULT:
+				ext4_iset_acl(inode, &ei->i_default_acl, acl);
+				break;
+		}
+	}
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * inode->i_mutex: down unless called from ext4_new_inode
+ */
+static int
+ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+	     struct posix_acl *acl)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int error;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+			if (acl) {
+				mode_t mode = inode->i_mode;
+				error = posix_acl_equiv_mode(acl, &mode);
+				if (error < 0)
+					return error;
+				else {
+					inode->i_mode = mode;
+					ext4_mark_inode_dirty(handle, inode);
+					if (error == 0)
+						acl = NULL;
+				}
+			}
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+			if (!S_ISDIR(inode->i_mode))
+				return acl ? -EACCES : 0;
+			break;
+
+		default:
+			return -EINVAL;
+	}
+	if (acl) {
+		value = ext4_acl_to_disk(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	error = ext4_xattr_set_handle(handle, inode, name_index, "",
+				      value, size, 0);
+
+	kfree(value);
+	if (!error) {
+		switch(type) {
+			case ACL_TYPE_ACCESS:
+				ext4_iset_acl(inode, &ei->i_acl, acl);
+				break;
+
+			case ACL_TYPE_DEFAULT:
+				ext4_iset_acl(inode, &ei->i_default_acl, acl);
+				break;
+		}
+	}
+	return error;
+}
+
+static int
+ext4_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+
+	return -EAGAIN;
+}
+
+int
+ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	return generic_permission(inode, mask, ext4_check_acl);
+}
+
+/*
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
+ */
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *acl = NULL;
+	int error = 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (test_opt(dir->i_sb, POSIX_ACL)) {
+			acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
+		struct posix_acl *clone;
+		mode_t mode;
+
+		if (S_ISDIR(inode->i_mode)) {
+			error = ext4_set_acl(handle, inode,
+					     ACL_TYPE_DEFAULT, acl);
+			if (error)
+				goto cleanup;
+		}
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		error = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		mode = inode->i_mode;
+		error = posix_acl_create_masq(clone, &mode);
+		if (error >= 0) {
+			inode->i_mode = mode;
+			if (error > 0) {
+				/* This is an extended ACL */
+				error = ext4_set_acl(handle, inode,
+						     ACL_TYPE_ACCESS, clone);
+			}
+		}
+		posix_acl_release(clone);
+	}
+cleanup:
+	posix_acl_release(acl);
+	return error;
+}
+
+/*
+ * Does chmod for an inode that may have an Access Control List. The
+ * inode->i_mode field must be updated to the desired value by the caller
+ * before calling this function.
+ * Returns 0 on success, or a negative error number.
+ *
+ * We change the ACL rather than storing some ACL entries in the file
+ * mode permission bits (which would be more efficient), because that
+ * would break once additional permissions (like  ACL_APPEND, ACL_DELETE
+ * for directories) are added. There are no more bits available in the
+ * file mode.
+ *
+ * inode->i_mutex: down
+ */
+int
+ext4_acl_chmod(struct inode *inode)
+{
+	struct posix_acl *acl, *clone;
+	int error;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return 0;
+	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	error = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!error) {
+		handle_t *handle;
+		int retries = 0;
+
+	retry:
+		handle = ext4_journal_start(inode,
+				EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+		if (IS_ERR(handle)) {
+			error = PTR_ERR(handle);
+			ext4_std_error(inode->i_sb, error);
+			goto out;
+		}
+		error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
+		ext4_journal_stop(handle);
+		if (error == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+	}
+out:
+	posix_acl_release(clone);
+	return error;
+}
+
+/*
+ * Extended attribute handlers
+ */
+static size_t
+ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
+			   const char *name, size_t name_len)
+{
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return 0;
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+	return size;
+}
+
+static size_t
+ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
+			    const char *name, size_t name_len)
+{
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return 0;
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+	return size;
+}
+
+static int
+ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+{
+	struct posix_acl *acl;
+	int error;
+
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	acl = ext4_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+static int
+ext4_xattr_get_acl_access(struct inode *inode, const char *name,
+			  void *buffer, size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int
+ext4_xattr_get_acl_default(struct inode *inode, const char *name,
+			   void *buffer, size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int
+ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
+		   size_t size)
+{
+	handle_t *handle;
+	struct posix_acl *acl;
+	int error, retries = 0;
+
+	if (!test_opt(inode->i_sb, POSIX_ACL))
+		return -EOPNOTSUPP;
+	if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			error = posix_acl_valid(acl);
+			if (error)
+				goto release_and_out;
+		}
+	} else
+		acl = NULL;
+
+retry:
+	handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	error = ext4_set_acl(handle, inode, type, acl);
+	ext4_journal_stop(handle);
+	if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+
+release_and_out:
+	posix_acl_release(acl);
+	return error;
+}
+
+static int
+ext4_xattr_set_acl_access(struct inode *inode, const char *name,
+			  const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int
+ext4_xattr_set_acl_default(struct inode *inode, const char *name,
+			   const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+struct xattr_handler ext4_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= ext4_xattr_list_acl_access,
+	.get	= ext4_xattr_get_acl_access,
+	.set	= ext4_xattr_set_acl_access,
+};
+
+struct xattr_handler ext4_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= ext4_xattr_list_acl_default,
+	.get	= ext4_xattr_get_acl_default,
+	.set	= ext4_xattr_set_acl_default,
+};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
new file mode 100644
index 000000000000..26a5c1abf147
--- /dev/null
+++ b/fs/ext4/acl.h
@@ -0,0 +1,81 @@
+/*
+  File: fs/ext4/acl.h
+
+  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/posix_acl_xattr.h>
+
+#define EXT4_ACL_VERSION	0x0001
+
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+	__le32		e_id;
+} ext4_acl_entry;
+
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+} ext4_acl_entry_short;
+
+typedef struct {
+	__le32		a_version;
+} ext4_acl_header;
+
+static inline size_t ext4_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(ext4_acl_header) +
+		       count * sizeof(ext4_acl_entry_short);
+	} else {
+		return sizeof(ext4_acl_header) +
+		       4 * sizeof(ext4_acl_entry_short) +
+		       (count - 4) * sizeof(ext4_acl_entry);
+	}
+}
+
+static inline int ext4_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(ext4_acl_header);
+	s = size - 4 * sizeof(ext4_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(ext4_acl_entry_short))
+			return -1;
+		return size / sizeof(ext4_acl_entry_short);
+	} else {
+		if (s % sizeof(ext4_acl_entry))
+			return -1;
+		return s / sizeof(ext4_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+
+/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
+   if the ACL has not been cached */
+#define EXT4_ACL_NOT_CACHED ((void *)-1)
+
+/* acl.c */
+extern int ext4_permission (struct inode *, int, struct nameidata *);
+extern int ext4_acl_chmod (struct inode *);
+extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
+
+#else  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#include <linux/sched.h>
+#define ext4_permission NULL
+
+static inline int
+ext4_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
+static inline int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+#endif  /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
new file mode 100644
index 000000000000..5d45582f9517
--- /dev/null
+++ b/fs/ext4/balloc.c
@@ -0,0 +1,1833 @@
+/*
+ *  linux/fs/ext4/balloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/time.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+
+/*
+ * balloc.c contains the blocks allocation and deallocation routines
+ */
+
+/*
+ * Calculate the block group number and offset, given a block number
+ */
+void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
+		unsigned long *blockgrpp, ext4_grpblk_t *offsetp)
+{
+        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	ext4_grpblk_t offset;
+
+        blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
+	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb));
+	if (offsetp)
+		*offsetp = offset;
+	if (blockgrpp)
+	        *blockgrpp = blocknr;
+
+}
+
+/*
+ * The free blocks are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.  The descriptors are loaded in memory
+ * when a file system is mounted (see ext4_read_super).
+ */
+
+
+#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+
+/**
+ * ext4_get_group_desc() -- load group descriptor from disk
+ * @sb:			super block
+ * @block_group:	given block group
+ * @bh:			pointer to the buffer head to store the block
+ *			group descriptor
+ */
+struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+					     unsigned int block_group,
+					     struct buffer_head ** bh)
+{
+	unsigned long group_desc;
+	unsigned long offset;
+	struct ext4_group_desc * desc;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (block_group >= sbi->s_groups_count) {
+		ext4_error (sb, "ext4_get_group_desc",
+			    "block_group >= groups_count - "
+			    "block_group = %d, groups_count = %lu",
+			    block_group, sbi->s_groups_count);
+
+		return NULL;
+	}
+	smp_rmb();
+
+	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
+	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+	if (!sbi->s_group_desc[group_desc]) {
+		ext4_error (sb, "ext4_get_group_desc",
+			    "Group descriptor not loaded - "
+			    "block_group = %d, group_desc = %lu, desc = %lu",
+			     block_group, group_desc, offset);
+		return NULL;
+	}
+
+	desc = (struct ext4_group_desc *)(
+		(__u8 *)sbi->s_group_desc[group_desc]->b_data +
+		offset * EXT4_DESC_SIZE(sb));
+	if (bh)
+		*bh = sbi->s_group_desc[group_desc];
+	return desc;
+}
+
+/**
+ * read_block_bitmap()
+ * @sb:			super block
+ * @block_group:	given block group
+ *
+ * Read the bitmap for a given block_group, reading into the specified
+ * slot in the superblock's bitmap cache.
+ *
+ * Return buffer_head on success or NULL in case of failure.
+ */
+static struct buffer_head *
+read_block_bitmap(struct super_block *sb, unsigned int block_group)
+{
+	struct ext4_group_desc * desc;
+	struct buffer_head * bh = NULL;
+
+	desc = ext4_get_group_desc (sb, block_group, NULL);
+	if (!desc)
+		goto error_out;
+	bh = sb_bread(sb, ext4_block_bitmap(sb, desc));
+	if (!bh)
+		ext4_error (sb, "read_block_bitmap",
+			    "Cannot read block bitmap - "
+			    "block_group = %d, block_bitmap = %llu",
+			    block_group,
+			    ext4_block_bitmap(sb, desc));
+error_out:
+	return bh;
+}
+/*
+ * The reservation window structure operations
+ * --------------------------------------------
+ * Operations include:
+ * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
+ *
+ * We use a red-black tree to represent per-filesystem reservation
+ * windows.
+ *
+ */
+
+/**
+ * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
+ * @rb_root:		root of per-filesystem reservation rb tree
+ * @verbose:		verbose mode
+ * @fn:			function which wishes to dump the reservation map
+ *
+ * If verbose is turned on, it will print the whole block reservation
+ * windows(start, end).	Otherwise, it will only print out the "bad" windows,
+ * those windows that overlap with their immediate neighbors.
+ */
+#if 1
+static void __rsv_window_dump(struct rb_root *root, int verbose,
+			      const char *fn)
+{
+	struct rb_node *n;
+	struct ext4_reserve_window_node *rsv, *prev;
+	int bad;
+
+restart:
+	n = rb_first(root);
+	bad = 0;
+	prev = NULL;
+
+	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
+	while (n) {
+		rsv = list_entry(n, struct ext4_reserve_window_node, rsv_node);
+		if (verbose)
+			printk("reservation window 0x%p "
+			       "start:  %llu, end:  %llu\n",
+			       rsv, rsv->rsv_start, rsv->rsv_end);
+		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
+			printk("Bad reservation %p (start >= end)\n",
+			       rsv);
+			bad = 1;
+		}
+		if (prev && prev->rsv_end >= rsv->rsv_start) {
+			printk("Bad reservation %p (prev->end >= start)\n",
+			       rsv);
+			bad = 1;
+		}
+		if (bad) {
+			if (!verbose) {
+				printk("Restarting reservation walk in verbose mode\n");
+				verbose = 1;
+				goto restart;
+			}
+		}
+		n = rb_next(n);
+		prev = rsv;
+	}
+	printk("Window map complete.\n");
+	if (bad)
+		BUG();
+}
+#define rsv_window_dump(root, verbose) \
+	__rsv_window_dump((root), (verbose), __FUNCTION__)
+#else
+#define rsv_window_dump(root, verbose) do {} while (0)
+#endif
+
+/**
+ * goal_in_my_reservation()
+ * @rsv:		inode's reservation window
+ * @grp_goal:		given goal block relative to the allocation block group
+ * @group:		the current allocation block group
+ * @sb:			filesystem super block
+ *
+ * Test if the given goal block (group relative) is within the file's
+ * own block reservation window range.
+ *
+ * If the reservation window is outside the goal allocation group, return 0;
+ * grp_goal (given goal block) could be -1, which means no specific
+ * goal block. In this case, always return 1.
+ * If the goal block is within the reservation window, return 1;
+ * otherwise, return 0;
+ */
+static int
+goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
+			unsigned int group, struct super_block * sb)
+{
+	ext4_fsblk_t group_first_block, group_last_block;
+
+	group_first_block = ext4_group_first_block_no(sb, group);
+	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+	if ((rsv->_rsv_start > group_last_block) ||
+	    (rsv->_rsv_end < group_first_block))
+		return 0;
+	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
+		|| (grp_goal + group_first_block > rsv->_rsv_end)))
+		return 0;
+	return 1;
+}
+
+/**
+ * search_reserve_window()
+ * @rb_root:		root of reservation tree
+ * @goal:		target allocation block
+ *
+ * Find the reserved window which includes the goal, or the previous one
+ * if the goal is not in any window.
+ * Returns NULL if there are no windows or if all windows start after the goal.
+ */
+static struct ext4_reserve_window_node *
+search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
+{
+	struct rb_node *n = root->rb_node;
+	struct ext4_reserve_window_node *rsv;
+
+	if (!n)
+		return NULL;
+
+	do {
+		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
+
+		if (goal < rsv->rsv_start)
+			n = n->rb_left;
+		else if (goal > rsv->rsv_end)
+			n = n->rb_right;
+		else
+			return rsv;
+	} while (n);
+	/*
+	 * We've fallen off the end of the tree: the goal wasn't inside
+	 * any particular node.  OK, the previous node must be to one
+	 * side of the interval containing the goal.  If it's the RHS,
+	 * we need to back up one.
+	 */
+	if (rsv->rsv_start > goal) {
+		n = rb_prev(&rsv->rsv_node);
+		rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
+	}
+	return rsv;
+}
+
+/**
+ * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
+ * @sb:			super block
+ * @rsv:		reservation window to add
+ *
+ * Must be called with rsv_lock hold.
+ */
+void ext4_rsv_window_add(struct super_block *sb,
+		    struct ext4_reserve_window_node *rsv)
+{
+	struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
+	struct rb_node *node = &rsv->rsv_node;
+	ext4_fsblk_t start = rsv->rsv_start;
+
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct ext4_reserve_window_node *this;
+
+	while (*p)
+	{
+		parent = *p;
+		this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
+
+		if (start < this->rsv_start)
+			p = &(*p)->rb_left;
+		else if (start > this->rsv_end)
+			p = &(*p)->rb_right;
+		else {
+			rsv_window_dump(root, 1);
+			BUG();
+		}
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+}
+
+/**
+ * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
+ * @sb:			super block
+ * @rsv:		reservation window to remove
+ *
+ * Mark the block reservation window as not allocated, and unlink it
+ * from the filesystem reservation window rb tree. Must be called with
+ * rsv_lock hold.
+ */
+static void rsv_window_remove(struct super_block *sb,
+			      struct ext4_reserve_window_node *rsv)
+{
+	rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+	rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+	rsv->rsv_alloc_hit = 0;
+	rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
+}
+
+/*
+ * rsv_is_empty() -- Check if the reservation window is allocated.
+ * @rsv:		given reservation window to check
+ *
+ * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
+ */
+static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
+{
+	/* a valid reservation end block could not be 0 */
+	return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+}
+
+/**
+ * ext4_init_block_alloc_info()
+ * @inode:		file inode structure
+ *
+ * Allocate and initialize the	reservation window structure, and
+ * link the window to the ext4 inode structure at last
+ *
+ * The reservation window structure is only dynamically allocated
+ * and linked to ext4 inode the first time the open file
+ * needs a new block. So, before every ext4_new_block(s) call, for
+ * regular files, we should check whether the reservation window
+ * structure exists or not. In the latter case, this function is called.
+ * Fail to do so will result in block reservation being turned off for that
+ * open file.
+ *
+ * This function is called from ext4_get_blocks_handle(), also called
+ * when setting the reservation window size through ioctl before the file
+ * is open for write (needs block allocation).
+ *
+ * Needs truncate_mutex protection prior to call this function.
+ */
+void ext4_init_block_alloc_info(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
+	struct super_block *sb = inode->i_sb;
+
+	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
+	if (block_i) {
+		struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
+
+		rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+		rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+
+		/*
+		 * if filesystem is mounted with NORESERVATION, the goal
+		 * reservation window size is set to zero to indicate
+		 * block reservation is off
+		 */
+		if (!test_opt(sb, RESERVATION))
+			rsv->rsv_goal_size = 0;
+		else
+			rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
+		rsv->rsv_alloc_hit = 0;
+		block_i->last_alloc_logical_block = 0;
+		block_i->last_alloc_physical_block = 0;
+	}
+	ei->i_block_alloc_info = block_i;
+}
+
+/**
+ * ext4_discard_reservation()
+ * @inode:		inode
+ *
+ * Discard(free) block reservation window on last file close, or truncate
+ * or at last iput().
+ *
+ * It is being called in three cases:
+ *	ext4_release_file(): last writer close the file
+ *	ext4_clear_inode(): last iput(), when nobody link to this file.
+ *	ext4_truncate(): when the block indirect map is about to change.
+ *
+ */
+void ext4_discard_reservation(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
+	struct ext4_reserve_window_node *rsv;
+	spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
+
+	if (!block_i)
+		return;
+
+	rsv = &block_i->rsv_window_node;
+	if (!rsv_is_empty(&rsv->rsv_window)) {
+		spin_lock(rsv_lock);
+		if (!rsv_is_empty(&rsv->rsv_window))
+			rsv_window_remove(inode->i_sb, rsv);
+		spin_unlock(rsv_lock);
+	}
+}
+
+/**
+ * ext4_free_blocks_sb() -- Free given blocks and update quota
+ * @handle:			handle to this transaction
+ * @sb:				super block
+ * @block:			start physcial block to free
+ * @count:			number of blocks to free
+ * @pdquot_freed_blocks:	pointer to quota
+ */
+void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
+			 ext4_fsblk_t block, unsigned long count,
+			 unsigned long *pdquot_freed_blocks)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gd_bh;
+	unsigned long block_group;
+	ext4_grpblk_t bit;
+	unsigned long i;
+	unsigned long overflow;
+	struct ext4_group_desc * desc;
+	struct ext4_super_block * es;
+	struct ext4_sb_info *sbi;
+	int err = 0, ret;
+	ext4_grpblk_t group_freed;
+
+	*pdquot_freed_blocks = 0;
+	sbi = EXT4_SB(sb);
+	es = sbi->s_es;
+	if (block < le32_to_cpu(es->s_first_data_block) ||
+	    block + count < block ||
+	    block + count > ext4_blocks_count(es)) {
+		ext4_error (sb, "ext4_free_blocks",
+			    "Freeing blocks not in datazone - "
+			    "block = %llu, count = %lu", block, count);
+		goto error_return;
+	}
+
+	ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
+
+do_more:
+	overflow = 0;
+	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+	/*
+	 * Check to see if we are freeing blocks across a group
+	 * boundary.
+	 */
+	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
+		overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
+		count -= overflow;
+	}
+	brelse(bitmap_bh);
+	bitmap_bh = read_block_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		goto error_return;
+	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
+	if (!desc)
+		goto error_return;
+
+	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+	    in_range(block + count - 1, ext4_inode_table(sb, desc),
+		     sbi->s_itb_per_group))
+		ext4_error (sb, "ext4_free_blocks",
+			    "Freeing blocks in system zones - "
+			    "Block = %llu, count = %lu",
+			    block, count);
+
+	/*
+	 * We are about to start releasing blocks in the bitmap,
+	 * so we need undo access.
+	 */
+	/* @@@ check errors */
+	BUFFER_TRACE(bitmap_bh, "getting undo access");
+	err = ext4_journal_get_undo_access(handle, bitmap_bh);
+	if (err)
+		goto error_return;
+
+	/*
+	 * We are about to modify some metadata.  Call the journal APIs
+	 * to unshare ->b_data if a currently-committing transaction is
+	 * using it
+	 */
+	BUFFER_TRACE(gd_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gd_bh);
+	if (err)
+		goto error_return;
+
+	jbd_lock_bh_state(bitmap_bh);
+
+	for (i = 0, group_freed = 0; i < count; i++) {
+		/*
+		 * An HJ special.  This is expensive...
+		 */
+#ifdef CONFIG_JBD_DEBUG
+		jbd_unlock_bh_state(bitmap_bh);
+		{
+			struct buffer_head *debug_bh;
+			debug_bh = sb_find_get_block(sb, block + i);
+			if (debug_bh) {
+				BUFFER_TRACE(debug_bh, "Deleted!");
+				if (!bh2jh(bitmap_bh)->b_committed_data)
+					BUFFER_TRACE(debug_bh,
+						"No commited data in bitmap");
+				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
+				__brelse(debug_bh);
+			}
+		}
+		jbd_lock_bh_state(bitmap_bh);
+#endif
+		if (need_resched()) {
+			jbd_unlock_bh_state(bitmap_bh);
+			cond_resched();
+			jbd_lock_bh_state(bitmap_bh);
+		}
+		/* @@@ This prevents newly-allocated data from being
+		 * freed and then reallocated within the same
+		 * transaction.
+		 *
+		 * Ideally we would want to allow that to happen, but to
+		 * do so requires making jbd2_journal_forget() capable of
+		 * revoking the queued write of a data block, which
+		 * implies blocking on the journal lock.  *forget()
+		 * cannot block due to truncate races.
+		 *
+		 * Eventually we can fix this by making jbd2_journal_forget()
+		 * return a status indicating whether or not it was able
+		 * to revoke the buffer.  On successful revoke, it is
+		 * safe not to set the allocation bit in the committed
+		 * bitmap, because we know that there is no outstanding
+		 * activity on the buffer any more and so it is safe to
+		 * reallocate it.
+		 */
+		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
+		J_ASSERT_BH(bitmap_bh,
+				bh2jh(bitmap_bh)->b_committed_data != NULL);
+		ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
+				bh2jh(bitmap_bh)->b_committed_data);
+
+		/*
+		 * We clear the bit in the bitmap after setting the committed
+		 * data bit, because this is the reverse order to that which
+		 * the allocator uses.
+		 */
+		BUFFER_TRACE(bitmap_bh, "clear bit");
+		if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+						bit + i, bitmap_bh->b_data)) {
+			jbd_unlock_bh_state(bitmap_bh);
+			ext4_error(sb, __FUNCTION__,
+				   "bit already cleared for block %llu",
+				   (ext4_fsblk_t)(block + i));
+			jbd_lock_bh_state(bitmap_bh);
+			BUFFER_TRACE(bitmap_bh, "bit already cleared");
+		} else {
+			group_freed++;
+		}
+	}
+	jbd_unlock_bh_state(bitmap_bh);
+
+	spin_lock(sb_bgl_lock(sbi, block_group));
+	desc->bg_free_blocks_count =
+		cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) +
+			group_freed);
+	spin_unlock(sb_bgl_lock(sbi, block_group));
+	percpu_counter_mod(&sbi->s_freeblocks_counter, count);
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+	ret = ext4_journal_dirty_metadata(handle, gd_bh);
+	if (!err) err = ret;
+	*pdquot_freed_blocks += group_freed;
+
+	if (overflow && !err) {
+		block += count;
+		count = overflow;
+		goto do_more;
+	}
+	sb->s_dirt = 1;
+error_return:
+	brelse(bitmap_bh);
+	ext4_std_error(sb, err);
+	return;
+}
+
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle:		handle for this transaction
+ * @inode:		inode
+ * @block:		start physical block to free
+ * @count:		number of blocks to count
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t block, unsigned long count)
+{
+	struct super_block * sb;
+	unsigned long dquot_freed_blocks;
+
+	sb = inode->i_sb;
+	if (!sb) {
+		printk ("ext4_free_blocks: nonexistent device");
+		return;
+	}
+	ext4_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
+	if (dquot_freed_blocks)
+		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+	return;
+}
+
+/**
+ * ext4_test_allocatable()
+ * @nr:			given allocation block group
+ * @bh:			bufferhead contains the bitmap of the given block group
+ *
+ * For ext4 allocations, we must not reuse any blocks which are
+ * allocated in the bitmap buffer's "last committed data" copy.  This
+ * prevents deletes from freeing up the page for reuse until we have
+ * committed the delete transaction.
+ *
+ * If we didn't do this, then deleting something and reallocating it as
+ * data would allow the old block to be overwritten before the
+ * transaction committed (because we force data to disk before commit).
+ * This would lead to corruption if we crashed between overwriting the
+ * data and committing the delete.
+ *
+ * @@@ We may want to make this allocation behaviour conditional on
+ * data-writes at some point, and disable it for metadata allocations or
+ * sync-data inodes.
+ */
+static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
+{
+	int ret;
+	struct journal_head *jh = bh2jh(bh);
+
+	if (ext4_test_bit(nr, bh->b_data))
+		return 0;
+
+	jbd_lock_bh_state(bh);
+	if (!jh->b_committed_data)
+		ret = 1;
+	else
+		ret = !ext4_test_bit(nr, jh->b_committed_data);
+	jbd_unlock_bh_state(bh);
+	return ret;
+}
+
+/**
+ * bitmap_search_next_usable_block()
+ * @start:		the starting block (group relative) of the search
+ * @bh:			bufferhead contains the block group bitmap
+ * @maxblocks:		the ending block (group relative) of the reservation
+ *
+ * The bitmap search --- search forward alternately through the actual
+ * bitmap on disk and the last-committed copy in journal, until we find a
+ * bit free in both bitmaps.
+ */
+static ext4_grpblk_t
+bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
+					ext4_grpblk_t maxblocks)
+{
+	ext4_grpblk_t next;
+	struct journal_head *jh = bh2jh(bh);
+
+	while (start < maxblocks) {
+		next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
+		if (next >= maxblocks)
+			return -1;
+		if (ext4_test_allocatable(next, bh))
+			return next;
+		jbd_lock_bh_state(bh);
+		if (jh->b_committed_data)
+			start = ext4_find_next_zero_bit(jh->b_committed_data,
+							maxblocks, next);
+		jbd_unlock_bh_state(bh);
+	}
+	return -1;
+}
+
+/**
+ * find_next_usable_block()
+ * @start:		the starting block (group relative) to find next
+ *			allocatable block in bitmap.
+ * @bh:			bufferhead contains the block group bitmap
+ * @maxblocks:		the ending block (group relative) for the search
+ *
+ * Find an allocatable block in a bitmap.  We honor both the bitmap and
+ * its last-committed copy (if that exists), and perform the "most
+ * appropriate allocation" algorithm of looking for a free block near
+ * the initial goal; then for a free byte somewhere in the bitmap; then
+ * for any free bit in the bitmap.
+ */
+static ext4_grpblk_t
+find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
+			ext4_grpblk_t maxblocks)
+{
+	ext4_grpblk_t here, next;
+	char *p, *r;
+
+	if (start > 0) {
+		/*
+		 * The goal was occupied; search forward for a free
+		 * block within the next XX blocks.
+		 *
+		 * end_goal is more or less random, but it has to be
+		 * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
+		 * next 64-bit boundary is simple..
+		 */
+		ext4_grpblk_t end_goal = (start + 63) & ~63;
+		if (end_goal > maxblocks)
+			end_goal = maxblocks;
+		here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
+		if (here < end_goal && ext4_test_allocatable(here, bh))
+			return here;
+		ext4_debug("Bit not found near goal\n");
+	}
+
+	here = start;
+	if (here < 0)
+		here = 0;
+
+	p = ((char *)bh->b_data) + (here >> 3);
+	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
+	next = (r - ((char *)bh->b_data)) << 3;
+
+	if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
+		return next;
+
+	/*
+	 * The bitmap search --- search forward alternately through the actual
+	 * bitmap and the last-committed copy until we find a bit free in
+	 * both
+	 */
+	here = bitmap_search_next_usable_block(here, bh, maxblocks);
+	return here;
+}
+
+/**
+ * claim_block()
+ * @block:		the free block (group relative) to allocate
+ * @bh:			the bufferhead containts the block group bitmap
+ *
+ * We think we can allocate this block in this bitmap.  Try to set the bit.
+ * If that succeeds then check that nobody has allocated and then freed the
+ * block since we saw that is was not marked in b_committed_data.  If it _was_
+ * allocated and freed then clear the bit in the bitmap again and return
+ * zero (failure).
+ */
+static inline int
+claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
+{
+	struct journal_head *jh = bh2jh(bh);
+	int ret;
+
+	if (ext4_set_bit_atomic(lock, block, bh->b_data))
+		return 0;
+	jbd_lock_bh_state(bh);
+	if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
+		ext4_clear_bit_atomic(lock, block, bh->b_data);
+		ret = 0;
+	} else {
+		ret = 1;
+	}
+	jbd_unlock_bh_state(bh);
+	return ret;
+}
+
+/**
+ * ext4_try_to_allocate()
+ * @sb:			superblock
+ * @handle:		handle to this transaction
+ * @group:		given allocation block group
+ * @bitmap_bh:		bufferhead holds the block bitmap
+ * @grp_goal:		given target block within the group
+ * @count:		target number of blocks to allocate
+ * @my_rsv:		reservation window
+ *
+ * Attempt to allocate blocks within a give range. Set the range of allocation
+ * first, then find the first free bit(s) from the bitmap (within the range),
+ * and at last, allocate the blocks by claiming the found free bit as allocated.
+ *
+ * To set the range of this allocation:
+ *	if there is a reservation window, only try to allocate block(s) from the
+ *	file's own reservation window;
+ *	Otherwise, the allocation range starts from the give goal block, ends at
+ *	the block group's last block.
+ *
+ * If we failed to allocate the desired block then we may end up crossing to a
+ * new bitmap.  In that case we must release write access to the old one via
+ * ext4_journal_release_buffer(), else we'll run out of credits.
+ */
+static ext4_grpblk_t
+ext4_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
+			struct buffer_head *bitmap_bh, ext4_grpblk_t grp_goal,
+			unsigned long *count, struct ext4_reserve_window *my_rsv)
+{
+	ext4_fsblk_t group_first_block;
+	ext4_grpblk_t start, end;
+	unsigned long num = 0;
+
+	/* we do allocation within the reservation window if we have a window */
+	if (my_rsv) {
+		group_first_block = ext4_group_first_block_no(sb, group);
+		if (my_rsv->_rsv_start >= group_first_block)
+			start = my_rsv->_rsv_start - group_first_block;
+		else
+			/* reservation window cross group boundary */
+			start = 0;
+		end = my_rsv->_rsv_end - group_first_block + 1;
+		if (end > EXT4_BLOCKS_PER_GROUP(sb))
+			/* reservation window crosses group boundary */
+			end = EXT4_BLOCKS_PER_GROUP(sb);
+		if ((start <= grp_goal) && (grp_goal < end))
+			start = grp_goal;
+		else
+			grp_goal = -1;
+	} else {
+		if (grp_goal > 0)
+			start = grp_goal;
+		else
+			start = 0;
+		end = EXT4_BLOCKS_PER_GROUP(sb);
+	}
+
+	BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
+
+repeat:
+	if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
+		grp_goal = find_next_usable_block(start, bitmap_bh, end);
+		if (grp_goal < 0)
+			goto fail_access;
+		if (!my_rsv) {
+			int i;
+
+			for (i = 0; i < 7 && grp_goal > start &&
+					ext4_test_allocatable(grp_goal - 1,
+								bitmap_bh);
+					i++, grp_goal--)
+				;
+		}
+	}
+	start = grp_goal;
+
+	if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
+		grp_goal, bitmap_bh)) {
+		/*
+		 * The block was allocated by another thread, or it was
+		 * allocated and then freed by another thread
+		 */
+		start++;
+		grp_goal++;
+		if (start >= end)
+			goto fail_access;
+		goto repeat;
+	}
+	num++;
+	grp_goal++;
+	while (num < *count && grp_goal < end
+		&& ext4_test_allocatable(grp_goal, bitmap_bh)
+		&& claim_block(sb_bgl_lock(EXT4_SB(sb), group),
+				grp_goal, bitmap_bh)) {
+		num++;
+		grp_goal++;
+	}
+	*count = num;
+	return grp_goal - num;
+fail_access:
+	*count = num;
+	return -1;
+}
+
+/**
+ *	find_next_reservable_window():
+ *		find a reservable space within the given range.
+ *		It does not allocate the reservation window for now:
+ *		alloc_new_reservation() will do the work later.
+ *
+ *	@search_head: the head of the searching list;
+ *		This is not necessarily the list head of the whole filesystem
+ *
+ *		We have both head and start_block to assist the search
+ *		for the reservable space. The list starts from head,
+ *		but we will shift to the place where start_block is,
+ *		then start from there, when looking for a reservable space.
+ *
+ *	@size: the target new reservation window size
+ *
+ *	@group_first_block: the first block we consider to start
+ *			the real search from
+ *
+ *	@last_block:
+ *		the maximum block number that our goal reservable space
+ *		could start from. This is normally the last block in this
+ *		group. The search will end when we found the start of next
+ *		possible reservable space is out of this boundary.
+ *		This could handle the cross boundary reservation window
+ *		request.
+ *
+ *	basically we search from the given range, rather than the whole
+ *	reservation double linked list, (start_block, last_block)
+ *	to find a free region that is of my size and has not
+ *	been reserved.
+ *
+ */
+static int find_next_reservable_window(
+				struct ext4_reserve_window_node *search_head,
+				struct ext4_reserve_window_node *my_rsv,
+				struct super_block * sb,
+				ext4_fsblk_t start_block,
+				ext4_fsblk_t last_block)
+{
+	struct rb_node *next;
+	struct ext4_reserve_window_node *rsv, *prev;
+	ext4_fsblk_t cur;
+	int size = my_rsv->rsv_goal_size;
+
+	/* TODO: make the start of the reservation window byte-aligned */
+	/* cur = *start_block & ~7;*/
+	cur = start_block;
+	rsv = search_head;
+	if (!rsv)
+		return -1;
+
+	while (1) {
+		if (cur <= rsv->rsv_end)
+			cur = rsv->rsv_end + 1;
+
+		/* TODO?
+		 * in the case we could not find a reservable space
+		 * that is what is expected, during the re-search, we could
+		 * remember what's the largest reservable space we could have
+		 * and return that one.
+		 *
+		 * For now it will fail if we could not find the reservable
+		 * space with expected-size (or more)...
+		 */
+		if (cur > last_block)
+			return -1;		/* fail */
+
+		prev = rsv;
+		next = rb_next(&rsv->rsv_node);
+		rsv = list_entry(next,struct ext4_reserve_window_node,rsv_node);
+
+		/*
+		 * Reached the last reservation, we can just append to the
+		 * previous one.
+		 */
+		if (!next)
+			break;
+
+		if (cur + size <= rsv->rsv_start) {
+			/*
+			 * Found a reserveable space big enough.  We could
+			 * have a reservation across the group boundary here
+			 */
+			break;
+		}
+	}
+	/*
+	 * we come here either :
+	 * when we reach the end of the whole list,
+	 * and there is empty reservable space after last entry in the list.
+	 * append it to the end of the list.
+	 *
+	 * or we found one reservable space in the middle of the list,
+	 * return the reservation window that we could append to.
+	 * succeed.
+	 */
+
+	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
+		rsv_window_remove(sb, my_rsv);
+
+	/*
+	 * Let's book the whole avaliable window for now.  We will check the
+	 * disk bitmap later and then, if there are free blocks then we adjust
+	 * the window size if it's larger than requested.
+	 * Otherwise, we will remove this node from the tree next time
+	 * call find_next_reservable_window.
+	 */
+	my_rsv->rsv_start = cur;
+	my_rsv->rsv_end = cur + size - 1;
+	my_rsv->rsv_alloc_hit = 0;
+
+	if (prev != my_rsv)
+		ext4_rsv_window_add(sb, my_rsv);
+
+	return 0;
+}
+
+/**
+ *	alloc_new_reservation()--allocate a new reservation window
+ *
+ *		To make a new reservation, we search part of the filesystem
+ *		reservation list (the list that inside the group). We try to
+ *		allocate a new reservation window near the allocation goal,
+ *		or the beginning of the group, if there is no goal.
+ *
+ *		We first find a reservable space after the goal, then from
+ *		there, we check the bitmap for the first free block after
+ *		it. If there is no free block until the end of group, then the
+ *		whole group is full, we failed. Otherwise, check if the free
+ *		block is inside the expected reservable space, if so, we
+ *		succeed.
+ *		If the first free block is outside the reservable space, then
+ *		start from the first free block, we search for next available
+ *		space, and go on.
+ *
+ *	on succeed, a new reservation will be found and inserted into the list
+ *	It contains at least one free block, and it does not overlap with other
+ *	reservation windows.
+ *
+ *	failed: we failed to find a reservation window in this group
+ *
+ *	@rsv: the reservation
+ *
+ *	@grp_goal: The goal (group-relative).  It is where the search for a
+ *		free reservable space should start from.
+ *		if we have a grp_goal(grp_goal >0 ), then start from there,
+ *		no grp_goal(grp_goal = -1), we start from the first block
+ *		of the group.
+ *
+ *	@sb: the super block
+ *	@group: the group we are trying to allocate in
+ *	@bitmap_bh: the block group block bitmap
+ *
+ */
+static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
+		ext4_grpblk_t grp_goal, struct super_block *sb,
+		unsigned int group, struct buffer_head *bitmap_bh)
+{
+	struct ext4_reserve_window_node *search_head;
+	ext4_fsblk_t group_first_block, group_end_block, start_block;
+	ext4_grpblk_t first_free_block;
+	struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
+	unsigned long size;
+	int ret;
+	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
+
+	group_first_block = ext4_group_first_block_no(sb, group);
+	group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+	if (grp_goal < 0)
+		start_block = group_first_block;
+	else
+		start_block = grp_goal + group_first_block;
+
+	size = my_rsv->rsv_goal_size;
+
+	if (!rsv_is_empty(&my_rsv->rsv_window)) {
+		/*
+		 * if the old reservation is cross group boundary
+		 * and if the goal is inside the old reservation window,
+		 * we will come here when we just failed to allocate from
+		 * the first part of the window. We still have another part
+		 * that belongs to the next group. In this case, there is no
+		 * point to discard our window and try to allocate a new one
+		 * in this group(which will fail). we should
+		 * keep the reservation window, just simply move on.
+		 *
+		 * Maybe we could shift the start block of the reservation
+		 * window to the first block of next group.
+		 */
+
+		if ((my_rsv->rsv_start <= group_end_block) &&
+				(my_rsv->rsv_end > group_end_block) &&
+				(start_block >= my_rsv->rsv_start))
+			return -1;
+
+		if ((my_rsv->rsv_alloc_hit >
+		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
+			/*
+			 * if the previously allocation hit ratio is
+			 * greater than 1/2, then we double the size of
+			 * the reservation window the next time,
+			 * otherwise we keep the same size window
+			 */
+			size = size * 2;
+			if (size > EXT4_MAX_RESERVE_BLOCKS)
+				size = EXT4_MAX_RESERVE_BLOCKS;
+			my_rsv->rsv_goal_size= size;
+		}
+	}
+
+	spin_lock(rsv_lock);
+	/*
+	 * shift the search start to the window near the goal block
+	 */
+	search_head = search_reserve_window(fs_rsv_root, start_block);
+
+	/*
+	 * find_next_reservable_window() simply finds a reservable window
+	 * inside the given range(start_block, group_end_block).
+	 *
+	 * To make sure the reservation window has a free bit inside it, we
+	 * need to check the bitmap after we found a reservable window.
+	 */
+retry:
+	ret = find_next_reservable_window(search_head, my_rsv, sb,
+						start_block, group_end_block);
+
+	if (ret == -1) {
+		if (!rsv_is_empty(&my_rsv->rsv_window))
+			rsv_window_remove(sb, my_rsv);
+		spin_unlock(rsv_lock);
+		return -1;
+	}
+
+	/*
+	 * On success, find_next_reservable_window() returns the
+	 * reservation window where there is a reservable space after it.
+	 * Before we reserve this reservable space, we need
+	 * to make sure there is at least a free block inside this region.
+	 *
+	 * searching the first free bit on the block bitmap and copy of
+	 * last committed bitmap alternatively, until we found a allocatable
+	 * block. Search start from the start block of the reservable space
+	 * we just found.
+	 */
+	spin_unlock(rsv_lock);
+	first_free_block = bitmap_search_next_usable_block(
+			my_rsv->rsv_start - group_first_block,
+			bitmap_bh, group_end_block - group_first_block + 1);
+
+	if (first_free_block < 0) {
+		/*
+		 * no free block left on the bitmap, no point
+		 * to reserve the space. return failed.
+		 */
+		spin_lock(rsv_lock);
+		if (!rsv_is_empty(&my_rsv->rsv_window))
+			rsv_window_remove(sb, my_rsv);
+		spin_unlock(rsv_lock);
+		return -1;		/* failed */
+	}
+
+	start_block = first_free_block + group_first_block;
+	/*
+	 * check if the first free block is within the
+	 * free space we just reserved
+	 */
+	if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end)
+		return 0;		/* success */
+	/*
+	 * if the first free bit we found is out of the reservable space
+	 * continue search for next reservable space,
+	 * start from where the free block is,
+	 * we also shift the list head to where we stopped last time
+	 */
+	search_head = my_rsv;
+	spin_lock(rsv_lock);
+	goto retry;
+}
+
+/**
+ * try_to_extend_reservation()
+ * @my_rsv:		given reservation window
+ * @sb:			super block
+ * @size:		the delta to extend
+ *
+ * Attempt to expand the reservation window large enough to have
+ * required number of free blocks
+ *
+ * Since ext4_try_to_allocate() will always allocate blocks within
+ * the reservation window range, if the window size is too small,
+ * multiple blocks allocation has to stop at the end of the reservation
+ * window. To make this more efficient, given the total number of
+ * blocks needed and the current size of the window, we try to
+ * expand the reservation window size if necessary on a best-effort
+ * basis before ext4_new_blocks() tries to allocate blocks,
+ */
+static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
+			struct super_block *sb, int size)
+{
+	struct ext4_reserve_window_node *next_rsv;
+	struct rb_node *next;
+	spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
+
+	if (!spin_trylock(rsv_lock))
+		return;
+
+	next = rb_next(&my_rsv->rsv_node);
+
+	if (!next)
+		my_rsv->rsv_end += size;
+	else {
+		next_rsv = list_entry(next, struct ext4_reserve_window_node, rsv_node);
+
+		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
+			my_rsv->rsv_end += size;
+		else
+			my_rsv->rsv_end = next_rsv->rsv_start - 1;
+	}
+	spin_unlock(rsv_lock);
+}
+
+/**
+ * ext4_try_to_allocate_with_rsv()
+ * @sb:			superblock
+ * @handle:		handle to this transaction
+ * @group:		given allocation block group
+ * @bitmap_bh:		bufferhead holds the block bitmap
+ * @grp_goal:		given target block within the group
+ * @count:		target number of blocks to allocate
+ * @my_rsv:		reservation window
+ * @errp:		pointer to store the error code
+ *
+ * This is the main function used to allocate a new block and its reservation
+ * window.
+ *
+ * Each time when a new block allocation is need, first try to allocate from
+ * its own reservation.  If it does not have a reservation window, instead of
+ * looking for a free bit on bitmap first, then look up the reservation list to
+ * see if it is inside somebody else's reservation window, we try to allocate a
+ * reservation window for it starting from the goal first. Then do the block
+ * allocation within the reservation window.
+ *
+ * This will avoid keeping on searching the reservation list again and
+ * again when somebody is looking for a free block (without
+ * reservation), and there are lots of free blocks, but they are all
+ * being reserved.
+ *
+ * We use a red-black tree for the per-filesystem reservation list.
+ *
+ */
+static ext4_grpblk_t
+ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
+			unsigned int group, struct buffer_head *bitmap_bh,
+			ext4_grpblk_t grp_goal,
+			struct ext4_reserve_window_node * my_rsv,
+			unsigned long *count, int *errp)
+{
+	ext4_fsblk_t group_first_block, group_last_block;
+	ext4_grpblk_t ret = 0;
+	int fatal;
+	unsigned long num = *count;
+
+	*errp = 0;
+
+	/*
+	 * Make sure we use undo access for the bitmap, because it is critical
+	 * that we do the frozen_data COW on bitmap buffers in all cases even
+	 * if the buffer is in BJ_Forget state in the committing transaction.
+	 */
+	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
+	fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
+	if (fatal) {
+		*errp = fatal;
+		return -1;
+	}
+
+	/*
+	 * we don't deal with reservation when
+	 * filesystem is mounted without reservation
+	 * or the file is not a regular file
+	 * or last attempt to allocate a block with reservation turned on failed
+	 */
+	if (my_rsv == NULL ) {
+		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
+						grp_goal, count, NULL);
+		goto out;
+	}
+	/*
+	 * grp_goal is a group relative block number (if there is a goal)
+	 * 0 < grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
+	 * first block is a filesystem wide block number
+	 * first block is the block number of the first block in this group
+	 */
+	group_first_block = ext4_group_first_block_no(sb, group);
+	group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+	/*
+	 * Basically we will allocate a new block from inode's reservation
+	 * window.
+	 *
+	 * We need to allocate a new reservation window, if:
+	 * a) inode does not have a reservation window; or
+	 * b) last attempt to allocate a block from existing reservation
+	 *    failed; or
+	 * c) we come here with a goal and with a reservation window
+	 *
+	 * We do not need to allocate a new reservation window if we come here
+	 * at the beginning with a goal and the goal is inside the window, or
+	 * we don't have a goal but already have a reservation window.
+	 * then we could go to allocate from the reservation window directly.
+	 */
+	while (1) {
+		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
+			!goal_in_my_reservation(&my_rsv->rsv_window,
+						grp_goal, group, sb)) {
+			if (my_rsv->rsv_goal_size < *count)
+				my_rsv->rsv_goal_size = *count;
+			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
+							group, bitmap_bh);
+			if (ret < 0)
+				break;			/* failed */
+
+			if (!goal_in_my_reservation(&my_rsv->rsv_window,
+							grp_goal, group, sb))
+				grp_goal = -1;
+		} else if (grp_goal > 0 &&
+			  (my_rsv->rsv_end-grp_goal+1) < *count)
+			try_to_extend_reservation(my_rsv, sb,
+					*count-my_rsv->rsv_end + grp_goal - 1);
+
+		if ((my_rsv->rsv_start > group_last_block) ||
+				(my_rsv->rsv_end < group_first_block)) {
+			rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
+			BUG();
+		}
+		ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
+					   grp_goal, &num, &my_rsv->rsv_window);
+		if (ret >= 0) {
+			my_rsv->rsv_alloc_hit += num;
+			*count = num;
+			break;				/* succeed */
+		}
+		num = *count;
+	}
+out:
+	if (ret >= 0) {
+		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
+					"bitmap block");
+		fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
+		if (fatal) {
+			*errp = fatal;
+			return -1;
+		}
+		return ret;
+	}
+
+	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
+	ext4_journal_release_buffer(handle, bitmap_bh);
+	return ret;
+}
+
+/**
+ * ext4_has_free_blocks()
+ * @sbi:		in-core super block structure.
+ *
+ * Check if filesystem has at least 1 free block available for allocation.
+ */
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+{
+	ext4_fsblk_t free_blocks, root_blocks;
+
+	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	root_blocks = ext4_r_blocks_count(sbi->s_es);
+	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+		sbi->s_resuid != current->fsuid &&
+		(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
+		return 0;
+	}
+	return 1;
+}
+
+/**
+ * ext4_should_retry_alloc()
+ * @sb:			super block
+ * @retries		number of attemps has been made
+ *
+ * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
+ * it is profitable to retry the operation, this function will wait
+ * for the current or commiting transaction to complete, and then
+ * return TRUE.
+ *
+ * if the total number of retries exceed three times, return FALSE.
+ */
+int ext4_should_retry_alloc(struct super_block *sb, int *retries)
+{
+	if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+		return 0;
+
+	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
+
+	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
+}
+
+/**
+ * ext4_new_blocks() -- core block(s) allocation function
+ * @handle:		handle to this transaction
+ * @inode:		file inode
+ * @goal:		given target block(filesystem wide)
+ * @count:		target number of blocks to allocate
+ * @errp:		error code
+ *
+ * ext4_new_blocks uses a goal block to assist allocation.  It tries to
+ * allocate block(s) from the block group contains the goal block first. If that
+ * fails, it will try to allocate block(s) from other block groups without
+ * any specific goal block.
+ *
+ */
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t goal, unsigned long *count, int *errp)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gdp_bh;
+	unsigned long group_no;
+	int goal_group;
+	ext4_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
+	ext4_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
+	ext4_fsblk_t ret_block;		/* filesyetem-wide allocated block */
+	int bgi;			/* blockgroup iteration index */
+	int fatal = 0, err;
+	int performed_allocation = 0;
+	ext4_grpblk_t free_blocks;	/* number of free blocks in a group */
+	struct super_block *sb;
+	struct ext4_group_desc *gdp;
+	struct ext4_super_block *es;
+	struct ext4_sb_info *sbi;
+	struct ext4_reserve_window_node *my_rsv = NULL;
+	struct ext4_block_alloc_info *block_i;
+	unsigned short windowsz = 0;
+#ifdef EXT4FS_DEBUG
+	static int goal_hits, goal_attempts;
+#endif
+	unsigned long ngroups;
+	unsigned long num = *count;
+
+	*errp = -ENOSPC;
+	sb = inode->i_sb;
+	if (!sb) {
+		printk("ext4_new_block: nonexistent device");
+		return 0;
+	}
+
+	/*
+	 * Check quota for allocation of this block.
+	 */
+	if (DQUOT_ALLOC_BLOCK(inode, num)) {
+		*errp = -EDQUOT;
+		return 0;
+	}
+
+	sbi = EXT4_SB(sb);
+	es = EXT4_SB(sb)->s_es;
+	ext4_debug("goal=%lu.\n", goal);
+	/*
+	 * Allocate a block from reservation only when
+	 * filesystem is mounted with reservation(default,-o reservation), and
+	 * it's a regular file, and
+	 * the desired window size is greater than 0 (One could use ioctl
+	 * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
+	 * reservation on that particular file)
+	 */
+	block_i = EXT4_I(inode)->i_block_alloc_info;
+	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
+		my_rsv = &block_i->rsv_window_node;
+
+	if (!ext4_has_free_blocks(sbi)) {
+		*errp = -ENOSPC;
+		goto out;
+	}
+
+	/*
+	 * First, test whether the goal block is free.
+	 */
+	if (goal < le32_to_cpu(es->s_first_data_block) ||
+	    goal >= ext4_blocks_count(es))
+		goal = le32_to_cpu(es->s_first_data_block);
+	ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
+	goal_group = group_no;
+retry_alloc:
+	gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
+	if (!gdp)
+		goto io_error;
+
+	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+	/*
+	 * if there is not enough free blocks to make a new resevation
+	 * turn off reservation for this allocation
+	 */
+	if (my_rsv && (free_blocks < windowsz)
+		&& (rsv_is_empty(&my_rsv->rsv_window)))
+		my_rsv = NULL;
+
+	if (free_blocks > 0) {
+		bitmap_bh = read_block_bitmap(sb, group_no);
+		if (!bitmap_bh)
+			goto io_error;
+		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, grp_target_blk,
+					my_rsv,	&num, &fatal);
+		if (fatal)
+			goto out;
+		if (grp_alloc_blk >= 0)
+			goto allocated;
+	}
+
+	ngroups = EXT4_SB(sb)->s_groups_count;
+	smp_rmb();
+
+	/*
+	 * Now search the rest of the groups.  We assume that
+	 * i and gdp correctly point to the last group visited.
+	 */
+	for (bgi = 0; bgi < ngroups; bgi++) {
+		group_no++;
+		if (group_no >= ngroups)
+			group_no = 0;
+		gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
+		if (!gdp) {
+			*errp = -EIO;
+			goto out;
+		}
+		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+		/*
+		 * skip this group if the number of
+		 * free blocks is less than half of the reservation
+		 * window size.
+		 */
+		if (free_blocks <= (windowsz/2))
+			continue;
+
+		brelse(bitmap_bh);
+		bitmap_bh = read_block_bitmap(sb, group_no);
+		if (!bitmap_bh)
+			goto io_error;
+		/*
+		 * try to allocate block(s) from this group, without a goal(-1).
+		 */
+		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
+					group_no, bitmap_bh, -1, my_rsv,
+					&num, &fatal);
+		if (fatal)
+			goto out;
+		if (grp_alloc_blk >= 0)
+			goto allocated;
+	}
+	/*
+	 * We may end up a bogus ealier ENOSPC error due to
+	 * filesystem is "full" of reservations, but
+	 * there maybe indeed free blocks avaliable on disk
+	 * In this case, we just forget about the reservations
+	 * just do block allocation as without reservations.
+	 */
+	if (my_rsv) {
+		my_rsv = NULL;
+		group_no = goal_group;
+		goto retry_alloc;
+	}
+	/* No space left on the device */
+	*errp = -ENOSPC;
+	goto out;
+
+allocated:
+
+	ext4_debug("using block group %d(%d)\n",
+			group_no, gdp->bg_free_blocks_count);
+
+	BUFFER_TRACE(gdp_bh, "get_write_access");
+	fatal = ext4_journal_get_write_access(handle, gdp_bh);
+	if (fatal)
+		goto out;
+
+	ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
+
+	if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
+	    in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
+	    in_range(ret_block, ext4_inode_table(sb, gdp),
+		     EXT4_SB(sb)->s_itb_per_group) ||
+	    in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
+		     EXT4_SB(sb)->s_itb_per_group))
+		ext4_error(sb, "ext4_new_block",
+			    "Allocating block in system zone - "
+			    "blocks from %llu, length %lu",
+			     ret_block, num);
+
+	performed_allocation = 1;
+
+#ifdef CONFIG_JBD_DEBUG
+	{
+		struct buffer_head *debug_bh;
+
+		/* Record bitmap buffer state in the newly allocated block */
+		debug_bh = sb_find_get_block(sb, ret_block);
+		if (debug_bh) {
+			BUFFER_TRACE(debug_bh, "state when allocated");
+			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
+			brelse(debug_bh);
+		}
+	}
+	jbd_lock_bh_state(bitmap_bh);
+	spin_lock(sb_bgl_lock(sbi, group_no));
+	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
+		int i;
+
+		for (i = 0; i < num; i++) {
+			if (ext4_test_bit(grp_alloc_blk+i,
+					bh2jh(bitmap_bh)->b_committed_data)) {
+				printk("%s: block was unexpectedly set in "
+					"b_committed_data\n", __FUNCTION__);
+			}
+		}
+	}
+	ext4_debug("found bit %d\n", grp_alloc_blk);
+	spin_unlock(sb_bgl_lock(sbi, group_no));
+	jbd_unlock_bh_state(bitmap_bh);
+#endif
+
+	if (ret_block + num - 1 >= ext4_blocks_count(es)) {
+		ext4_error(sb, "ext4_new_block",
+			    "block(%llu) >= blocks count(%llu) - "
+			    "block_group = %lu, es == %p ", ret_block,
+			ext4_blocks_count(es), group_no, es);
+		goto out;
+	}
+
+	/*
+	 * It is up to the caller to add the new buffer to a journal
+	 * list of some description.  We don't know in advance whether
+	 * the caller wants to use it as metadata or data.
+	 */
+	ext4_debug("allocating block %lu. Goal hits %d of %d.\n",
+			ret_block, goal_hits, goal_attempts);
+
+	spin_lock(sb_bgl_lock(sbi, group_no));
+	gdp->bg_free_blocks_count =
+			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num);
+	spin_unlock(sb_bgl_lock(sbi, group_no));
+	percpu_counter_mod(&sbi->s_freeblocks_counter, -num);
+
+	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
+	err = ext4_journal_dirty_metadata(handle, gdp_bh);
+	if (!fatal)
+		fatal = err;
+
+	sb->s_dirt = 1;
+	if (fatal)
+		goto out;
+
+	*errp = 0;
+	brelse(bitmap_bh);
+	DQUOT_FREE_BLOCK(inode, *count-num);
+	*count = num;
+	return ret_block;
+
+io_error:
+	*errp = -EIO;
+out:
+	if (fatal) {
+		*errp = fatal;
+		ext4_std_error(sb, fatal);
+	}
+	/*
+	 * Undo the block allocation
+	 */
+	if (!performed_allocation)
+		DQUOT_FREE_BLOCK(inode, *count);
+	brelse(bitmap_bh);
+	return 0;
+}
+
+ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t goal, int *errp)
+{
+	unsigned long count = 1;
+
+	return ext4_new_blocks(handle, inode, goal, &count, errp);
+}
+
+/**
+ * ext4_count_free_blocks() -- count filesystem free blocks
+ * @sb:		superblock
+ *
+ * Adds up the number of free blocks from each block group.
+ */
+ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
+{
+	ext4_fsblk_t desc_count;
+	struct ext4_group_desc *gdp;
+	int i;
+	unsigned long ngroups = EXT4_SB(sb)->s_groups_count;
+#ifdef EXT4FS_DEBUG
+	struct ext4_super_block *es;
+	ext4_fsblk_t bitmap_count;
+	unsigned long x;
+	struct buffer_head *bitmap_bh = NULL;
+
+	es = EXT4_SB(sb)->s_es;
+	desc_count = 0;
+	bitmap_count = 0;
+	gdp = NULL;
+
+	smp_rmb();
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+		brelse(bitmap_bh);
+		bitmap_bh = read_block_bitmap(sb, i);
+		if (bitmap_bh == NULL)
+			continue;
+
+		x = ext4_count_free(bitmap_bh, sb->s_blocksize);
+		printk("group %d: stored = %d, counted = %lu\n",
+			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+		bitmap_count += x;
+	}
+	brelse(bitmap_bh);
+	printk("ext4_count_free_blocks: stored = %llu"
+		", computed = %llu, %llu\n",
+	       EXT4_FREE_BLOCKS_COUNT(es),
+		desc_count, bitmap_count);
+	return bitmap_count;
+#else
+	desc_count = 0;
+	smp_rmb();
+	for (i = 0; i < ngroups; i++) {
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+	}
+
+	return desc_count;
+#endif
+}
+
+static inline int
+block_in_use(ext4_fsblk_t block, struct super_block *sb, unsigned char *map)
+{
+	ext4_grpblk_t offset;
+
+	ext4_get_group_no_and_offset(sb, block, NULL, &offset);
+	return ext4_test_bit (offset, map);
+}
+
+static inline int test_root(int a, int b)
+{
+	int num = b;
+
+	while (a > num)
+		num *= b;
+	return num == a;
+}
+
+static int ext4_group_sparse(int group)
+{
+	if (group <= 1)
+		return 1;
+	if (!(group & 1))
+		return 0;
+	return (test_root(group, 7) || test_root(group, 5) ||
+		test_root(group, 3));
+}
+
+/**
+ *	ext4_bg_has_super - number of blocks used by the superblock in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the superblock (primary or backup)
+ *	in this group.  Currently this will be only 0 or 1.
+ */
+int ext4_bg_has_super(struct super_block *sb, int group)
+{
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
+			!ext4_group_sparse(group))
+		return 0;
+	return 1;
+}
+
+static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, int group)
+{
+	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
+	unsigned long first = metagroup * EXT4_DESC_PER_BLOCK(sb);
+	unsigned long last = first + EXT4_DESC_PER_BLOCK(sb) - 1;
+
+	if (group == first || group == first + 1 || group == last)
+		return 1;
+	return 0;
+}
+
+static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, int group)
+{
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
+			!ext4_group_sparse(group))
+		return 0;
+	return EXT4_SB(sb)->s_gdb_count;
+}
+
+/**
+ *	ext4_bg_num_gdb - number of blocks used by the group table in group
+ *	@sb: superblock for filesystem
+ *	@group: group number to check
+ *
+ *	Return the number of blocks used by the group descriptor table
+ *	(primary or backup) in this group.  In the future there may be a
+ *	different number of descriptor blocks in each group.
+ */
+unsigned long ext4_bg_num_gdb(struct super_block *sb, int group)
+{
+	unsigned long first_meta_bg =
+			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+	unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
+			metagroup < first_meta_bg)
+		return ext4_bg_num_gdb_nometa(sb,group);
+
+	return ext4_bg_num_gdb_meta(sb,group);
+
+}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
new file mode 100644
index 000000000000..11e93c169bcf
--- /dev/null
+++ b/fs/ext4/bitmap.c
@@ -0,0 +1,32 @@
+/*
+ *  linux/fs/ext4/bitmap.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+
+#ifdef EXT4FS_DEBUG
+
+static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+
+unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
+{
+	unsigned int i;
+	unsigned long sum = 0;
+
+	if (!map)
+		return (0);
+	for (i = 0; i < numchars; i++)
+		sum += nibblemap[map->b_data[i] & 0xf] +
+			nibblemap[(map->b_data[i] >> 4) & 0xf];
+	return (sum);
+}
+
+#endif  /*  EXT4FS_DEBUG  */
+
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
new file mode 100644
index 000000000000..f8595787a70e
--- /dev/null
+++ b/fs/ext4/dir.c
@@ -0,0 +1,518 @@
+/*
+ *  linux/fs/ext4/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/dir.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext4 directory handling functions
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * Hash Tree Directory indexing (c) 2001  Daniel Phillips
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+
+static unsigned char ext4_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static int ext4_readdir(struct file *, void *, filldir_t);
+static int ext4_dx_readdir(struct file * filp,
+			   void * dirent, filldir_t filldir);
+static int ext4_release_dir (struct inode * inode,
+				struct file * filp);
+
+const struct file_operations ext4_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= ext4_readdir,		/* we take BKL. needed?*/
+	.ioctl		= ext4_ioctl,		/* BKL held */
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext4_compat_ioctl,
+#endif
+	.fsync		= ext4_sync_file,	/* BKL held */
+#ifdef CONFIG_EXT4_INDEX
+	.release	= ext4_release_dir,
+#endif
+};
+
+
+static unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+	    (filetype >= EXT4_FT_MAX))
+		return DT_UNKNOWN;
+
+	return (ext4_filetype_table[filetype]);
+}
+
+
+int ext4_check_dir_entry (const char * function, struct inode * dir,
+			  struct ext4_dir_entry_2 * de,
+			  struct buffer_head * bh,
+			  unsigned long offset)
+{
+	const char * error_msg = NULL;
+	const int rlen = le16_to_cpu(de->rec_len);
+
+	if (rlen < EXT4_DIR_REC_LEN(1))
+		error_msg = "rec_len is smaller than minimal";
+	else if (rlen % 4 != 0)
+		error_msg = "rec_len % 4 != 0";
+	else if (rlen < EXT4_DIR_REC_LEN(de->name_len))
+		error_msg = "rec_len is too small for name_len";
+	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+		error_msg = "directory entry across blocks";
+	else if (le32_to_cpu(de->inode) >
+			le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))
+		error_msg = "inode out of bounds";
+
+	if (error_msg != NULL)
+		ext4_error (dir->i_sb, function,
+			"bad entry in directory #%lu: %s - "
+			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+			dir->i_ino, error_msg, offset,
+			(unsigned long) le32_to_cpu(de->inode),
+			rlen, de->name_len);
+	return error_msg == NULL ? 1 : 0;
+}
+
+static int ext4_readdir(struct file * filp,
+			 void * dirent, filldir_t filldir)
+{
+	int error = 0;
+	unsigned long offset;
+	int i, stored;
+	struct ext4_dir_entry_2 *de;
+	struct super_block *sb;
+	int err;
+	struct inode *inode = filp->f_dentry->d_inode;
+	int ret = 0;
+
+	sb = inode->i_sb;
+
+#ifdef CONFIG_EXT4_INDEX
+	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+				    EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+	    ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+	     ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
+		err = ext4_dx_readdir(filp, dirent, filldir);
+		if (err != ERR_BAD_DX_DIR) {
+			ret = err;
+			goto out;
+		}
+		/*
+		 * We don't set the inode dirty flag since it's not
+		 * critical that it get flushed back to the disk.
+		 */
+		EXT4_I(filp->f_dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+	}
+#endif
+	stored = 0;
+	offset = filp->f_pos & (sb->s_blocksize - 1);
+
+	while (!error && !stored && filp->f_pos < inode->i_size) {
+		unsigned long blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		struct buffer_head map_bh;
+		struct buffer_head *bh = NULL;
+
+		map_bh.b_state = 0;
+		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+		if (err > 0) {
+			page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
+				&filp->f_ra,
+				filp,
+				map_bh.b_blocknr >>
+					(PAGE_CACHE_SHIFT - inode->i_blkbits),
+				1);
+			bh = ext4_bread(NULL, inode, blk, 0, &err);
+		}
+
+		/*
+		 * We ignore I/O errors on directories so users have a chance
+		 * of recovering data when there's a bad sector
+		 */
+		if (!bh) {
+			ext4_error (sb, "ext4_readdir",
+				"directory #%lu contains a hole at offset %lu",
+				inode->i_ino, (unsigned long)filp->f_pos);
+			filp->f_pos += sb->s_blocksize - offset;
+			continue;
+		}
+
+revalidate:
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (filp->f_version != inode->i_version) {
+			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+				de = (struct ext4_dir_entry_2 *)
+					(bh->b_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->rec_len) <
+						EXT4_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->rec_len);
+			}
+			offset = i;
+			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+				| offset;
+			filp->f_version = inode->i_version;
+		}
+
+		while (!error && filp->f_pos < inode->i_size
+		       && offset < sb->s_blocksize) {
+			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
+			if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
+						   bh, offset)) {
+				/*
+				 * On error, skip the f_pos to the next block
+				 */
+				filp->f_pos = (filp->f_pos |
+						(sb->s_blocksize - 1)) + 1;
+				brelse (bh);
+				ret = stored;
+				goto out;
+			}
+			offset += le16_to_cpu(de->rec_len);
+			if (le32_to_cpu(de->inode)) {
+				/* We might block in the next section
+				 * if the data destination is
+				 * currently swapped out.  So, use a
+				 * version stamp to detect whether or
+				 * not the directory has been modified
+				 * during the copy operation.
+				 */
+				unsigned long version = filp->f_version;
+
+				error = filldir(dirent, de->name,
+						de->name_len,
+						filp->f_pos,
+						le32_to_cpu(de->inode),
+						get_dtype(sb, de->file_type));
+				if (error)
+					break;
+				if (version != filp->f_version)
+					goto revalidate;
+				stored ++;
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		offset = 0;
+		brelse (bh);
+	}
+out:
+	return ret;
+}
+
+#ifdef CONFIG_EXT4_INDEX
+/*
+ * These functions convert from the major/minor hash to an f_pos
+ * value.
+ *
+ * Currently we only use major hash numer.  This is unfortunate, but
+ * on 32-bit machines, the same VFS interface is used for lseek and
+ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
+ * lseek/telldir/seekdir will blow out spectacularly, and from within
+ * the ext2 low-level routine, we don't know if we're being called by
+ * a 64-bit version of the system call or the 32-bit version of the
+ * system call.  Worse yet, NFSv2 only allows for a 32-bit readdir
+ * cookie.  Sigh.
+ */
+#define hash2pos(major, minor)	(major >> 1)
+#define pos2maj_hash(pos)	((pos << 1) & 0xffffffff)
+#define pos2min_hash(pos)	(0)
+
+/*
+ * This structure holds the nodes of the red-black tree used to store
+ * the directory entry in hash order.
+ */
+struct fname {
+	__u32		hash;
+	__u32		minor_hash;
+	struct rb_node	rb_hash;
+	struct fname	*next;
+	__u32		inode;
+	__u8		name_len;
+	__u8		file_type;
+	char		name[0];
+};
+
+/*
+ * This functoin implements a non-recursive way of freeing all of the
+ * nodes in the red-black tree.
+ */
+static void free_rb_tree_fname(struct rb_root *root)
+{
+	struct rb_node	*n = root->rb_node;
+	struct rb_node	*parent;
+	struct fname	*fname;
+
+	while (n) {
+		/* Do the node's children first */
+		if ((n)->rb_left) {
+			n = n->rb_left;
+			continue;
+		}
+		if (n->rb_right) {
+			n = n->rb_right;
+			continue;
+		}
+		/*
+		 * The node has no children; free it, and then zero
+		 * out parent's link to it.  Finally go to the
+		 * beginning of the loop and try to free the parent
+		 * node.
+		 */
+		parent = rb_parent(n);
+		fname = rb_entry(n, struct fname, rb_hash);
+		while (fname) {
+			struct fname * old = fname;
+			fname = fname->next;
+			kfree (old);
+		}
+		if (!parent)
+			root->rb_node = NULL;
+		else if (parent->rb_left == n)
+			parent->rb_left = NULL;
+		else if (parent->rb_right == n)
+			parent->rb_right = NULL;
+		n = parent;
+	}
+	root->rb_node = NULL;
+}
+
+
+static struct dir_private_info *create_dir_info(loff_t pos)
+{
+	struct dir_private_info *p;
+
+	p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	if (!p)
+		return NULL;
+	p->root.rb_node = NULL;
+	p->curr_node = NULL;
+	p->extra_fname = NULL;
+	p->last_pos = 0;
+	p->curr_hash = pos2maj_hash(pos);
+	p->curr_minor_hash = pos2min_hash(pos);
+	p->next_hash = 0;
+	return p;
+}
+
+void ext4_htree_free_dir_info(struct dir_private_info *p)
+{
+	free_rb_tree_fname(&p->root);
+	kfree(p);
+}
+
+/*
+ * Given a directory entry, enter it into the fname rb tree.
+ */
+int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+			     __u32 minor_hash,
+			     struct ext4_dir_entry_2 *dirent)
+{
+	struct rb_node **p, *parent = NULL;
+	struct fname * fname, *new_fn;
+	struct dir_private_info *info;
+	int len;
+
+	info = (struct dir_private_info *) dir_file->private_data;
+	p = &info->root.rb_node;
+
+	/* Create and allocate the fname structure */
+	len = sizeof(struct fname) + dirent->name_len + 1;
+	new_fn = kzalloc(len, GFP_KERNEL);
+	if (!new_fn)
+		return -ENOMEM;
+	new_fn->hash = hash;
+	new_fn->minor_hash = minor_hash;
+	new_fn->inode = le32_to_cpu(dirent->inode);
+	new_fn->name_len = dirent->name_len;
+	new_fn->file_type = dirent->file_type;
+	memcpy(new_fn->name, dirent->name, dirent->name_len);
+	new_fn->name[dirent->name_len] = 0;
+
+	while (*p) {
+		parent = *p;
+		fname = rb_entry(parent, struct fname, rb_hash);
+
+		/*
+		 * If the hash and minor hash match up, then we put
+		 * them on a linked list.  This rarely happens...
+		 */
+		if ((new_fn->hash == fname->hash) &&
+		    (new_fn->minor_hash == fname->minor_hash)) {
+			new_fn->next = fname->next;
+			fname->next = new_fn;
+			return 0;
+		}
+
+		if (new_fn->hash < fname->hash)
+			p = &(*p)->rb_left;
+		else if (new_fn->hash > fname->hash)
+			p = &(*p)->rb_right;
+		else if (new_fn->minor_hash < fname->minor_hash)
+			p = &(*p)->rb_left;
+		else /* if (new_fn->minor_hash > fname->minor_hash) */
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&new_fn->rb_hash, parent, p);
+	rb_insert_color(&new_fn->rb_hash, &info->root);
+	return 0;
+}
+
+
+
+/*
+ * This is a helper function for ext4_dx_readdir.  It calls filldir
+ * for all entres on the fname linked list.  (Normally there is only
+ * one entry on the linked list, unless there are 62 bit hash collisions.)
+ */
+static int call_filldir(struct file * filp, void * dirent,
+			filldir_t filldir, struct fname *fname)
+{
+	struct dir_private_info *info = filp->private_data;
+	loff_t	curr_pos;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block * sb;
+	int error;
+
+	sb = inode->i_sb;
+
+	if (!fname) {
+		printk("call_filldir: called with null fname?!?\n");
+		return 0;
+	}
+	curr_pos = hash2pos(fname->hash, fname->minor_hash);
+	while (fname) {
+		error = filldir(dirent, fname->name,
+				fname->name_len, curr_pos,
+				fname->inode,
+				get_dtype(sb, fname->file_type));
+		if (error) {
+			filp->f_pos = curr_pos;
+			info->extra_fname = fname->next;
+			return error;
+		}
+		fname = fname->next;
+	}
+	return 0;
+}
+
+static int ext4_dx_readdir(struct file * filp,
+			 void * dirent, filldir_t filldir)
+{
+	struct dir_private_info *info = filp->private_data;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct fname *fname;
+	int	ret;
+
+	if (!info) {
+		info = create_dir_info(filp->f_pos);
+		if (!info)
+			return -ENOMEM;
+		filp->private_data = info;
+	}
+
+	if (filp->f_pos == EXT4_HTREE_EOF)
+		return 0;	/* EOF */
+
+	/* Some one has messed with f_pos; reset the world */
+	if (info->last_pos != filp->f_pos) {
+		free_rb_tree_fname(&info->root);
+		info->curr_node = NULL;
+		info->extra_fname = NULL;
+		info->curr_hash = pos2maj_hash(filp->f_pos);
+		info->curr_minor_hash = pos2min_hash(filp->f_pos);
+	}
+
+	/*
+	 * If there are any leftover names on the hash collision
+	 * chain, return them first.
+	 */
+	if (info->extra_fname &&
+	    call_filldir(filp, dirent, filldir, info->extra_fname))
+		goto finished;
+
+	if (!info->curr_node)
+		info->curr_node = rb_first(&info->root);
+
+	while (1) {
+		/*
+		 * Fill the rbtree if we have no more entries,
+		 * or the inode has changed since we last read in the
+		 * cached entries.
+		 */
+		if ((!info->curr_node) ||
+		    (filp->f_version != inode->i_version)) {
+			info->curr_node = NULL;
+			free_rb_tree_fname(&info->root);
+			filp->f_version = inode->i_version;
+			ret = ext4_htree_fill_tree(filp, info->curr_hash,
+						   info->curr_minor_hash,
+						   &info->next_hash);
+			if (ret < 0)
+				return ret;
+			if (ret == 0) {
+				filp->f_pos = EXT4_HTREE_EOF;
+				break;
+			}
+			info->curr_node = rb_first(&info->root);
+		}
+
+		fname = rb_entry(info->curr_node, struct fname, rb_hash);
+		info->curr_hash = fname->hash;
+		info->curr_minor_hash = fname->minor_hash;
+		if (call_filldir(filp, dirent, filldir, fname))
+			break;
+
+		info->curr_node = rb_next(info->curr_node);
+		if (!info->curr_node) {
+			if (info->next_hash == ~0) {
+				filp->f_pos = EXT4_HTREE_EOF;
+				break;
+			}
+			info->curr_hash = info->next_hash;
+			info->curr_minor_hash = 0;
+		}
+	}
+finished:
+	info->last_pos = filp->f_pos;
+	return 0;
+}
+
+static int ext4_release_dir (struct inode * inode, struct file * filp)
+{
+	if (filp->private_data)
+		ext4_htree_free_dir_info(filp->private_data);
+
+	return 0;
+}
+
+#endif
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
new file mode 100644
index 000000000000..2608dce18f3e
--- /dev/null
+++ b/fs/ext4/extents.c
@@ -0,0 +1,2152 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * Architecture independence:
+ *   Copyright (c) 2005, Bull S.A.
+ *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
+ */
+
+/*
+ * Extents support for EXT4
+ *
+ * TODO:
+ *   - ext4*_error() should be used in some situations
+ *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
+ *   - smart tree reduction
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/jbd.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/ext4_fs_extents.h>
+#include <asm/uaccess.h>
+
+
+/*
+ * ext_pblock:
+ * combine low and high parts of physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t ext_pblock(struct ext4_extent *ex)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ex->ee_start);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * idx_pblock:
+ * combine low and high parts of a leaf physical block number into ext4_fsblk_t
+ */
+static inline ext4_fsblk_t idx_pblock(struct ext4_extent_idx *ix)
+{
+	ext4_fsblk_t block;
+
+	block = le32_to_cpu(ix->ei_leaf);
+	block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1;
+	return block;
+}
+
+/*
+ * ext4_ext_store_pblock:
+ * stores a large physical block number into an extent struct,
+ * breaking it into parts
+ */
+static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb)
+{
+	ex->ee_start = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
+}
+
+/*
+ * ext4_idx_store_pblock:
+ * stores a large physical block number into an index struct,
+ * breaking it into parts
+ */
+static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
+{
+	ix->ei_leaf = cpu_to_le32((unsigned long) (pb & 0xffffffff));
+	ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
+}
+
+static int ext4_ext_check_header(const char *function, struct inode *inode,
+				struct ext4_extent_header *eh)
+{
+	const char *error_msg = NULL;
+
+	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
+		error_msg = "invalid magic";
+		goto corrupted;
+	}
+	if (unlikely(eh->eh_max == 0)) {
+		error_msg = "invalid eh_max";
+		goto corrupted;
+	}
+	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
+		error_msg = "invalid eh_entries";
+		goto corrupted;
+	}
+	return 0;
+
+corrupted:
+	ext4_error(inode->i_sb, function,
+			"bad header in inode #%lu: %s - magic %x, "
+			"entries %u, max %u, depth %u",
+			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
+			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
+			le16_to_cpu(eh->eh_depth));
+
+	return -EIO;
+}
+
+static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+{
+	int err;
+
+	if (handle->h_buffer_credits > needed)
+		return handle;
+	if (!ext4_journal_extend(handle, needed))
+		return handle;
+	err = ext4_journal_restart(handle, needed);
+
+	return handle;
+}
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ */
+static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
+				struct ext4_ext_path *path)
+{
+	if (path->p_bh) {
+		/* path points to block */
+		return ext4_journal_get_write_access(handle, path->p_bh);
+	}
+	/* path points to leaf/index in inode body */
+	/* we use in-core data, no need to protect them */
+	return 0;
+}
+
+/*
+ * could return:
+ *  - EROFS
+ *  - ENOMEM
+ *  - EIO
+ */
+static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
+				struct ext4_ext_path *path)
+{
+	int err;
+	if (path->p_bh) {
+		/* path points to block */
+		err = ext4_journal_dirty_metadata(handle, path->p_bh);
+	} else {
+		/* path points to leaf/index in inode body */
+		err = ext4_mark_inode_dirty(handle, inode);
+	}
+	return err;
+}
+
+static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
+			      struct ext4_ext_path *path,
+			      ext4_fsblk_t block)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_fsblk_t bg_start;
+	ext4_grpblk_t colour;
+	int depth;
+
+	if (path) {
+		struct ext4_extent *ex;
+		depth = path->p_depth;
+
+		/* try to predict block placement */
+		if ((ex = path[depth].p_ext))
+			return ext_pblock(ex)+(block-le32_to_cpu(ex->ee_block));
+
+		/* it looks like index is empty;
+		 * try to find starting block from index itself */
+		if (path[depth].p_bh)
+			return path[depth].p_bh->b_blocknr;
+	}
+
+	/* OK. use inode's group */
+	bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+		le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
+	colour = (current->pid % 16) *
+			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	return bg_start + colour + block;
+}
+
+static ext4_fsblk_t
+ext4_ext_new_block(handle_t *handle, struct inode *inode,
+			struct ext4_ext_path *path,
+			struct ext4_extent *ex, int *err)
+{
+	ext4_fsblk_t goal, newblock;
+
+	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
+	newblock = ext4_new_block(handle, inode, goal, err);
+	return newblock;
+}
+
+static inline int ext4_ext_space_block(struct inode *inode)
+{
+	int size;
+
+	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+			/ sizeof(struct ext4_extent);
+#ifdef AGRESSIVE_TEST
+	if (size > 6)
+		size = 6;
+#endif
+	return size;
+}
+
+static inline int ext4_ext_space_block_idx(struct inode *inode)
+{
+	int size;
+
+	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+			/ sizeof(struct ext4_extent_idx);
+#ifdef AGRESSIVE_TEST
+	if (size > 5)
+		size = 5;
+#endif
+	return size;
+}
+
+static inline int ext4_ext_space_root(struct inode *inode)
+{
+	int size;
+
+	size = sizeof(EXT4_I(inode)->i_data);
+	size -= sizeof(struct ext4_extent_header);
+	size /= sizeof(struct ext4_extent);
+#ifdef AGRESSIVE_TEST
+	if (size > 3)
+		size = 3;
+#endif
+	return size;
+}
+
+static inline int ext4_ext_space_root_idx(struct inode *inode)
+{
+	int size;
+
+	size = sizeof(EXT4_I(inode)->i_data);
+	size -= sizeof(struct ext4_extent_header);
+	size /= sizeof(struct ext4_extent_idx);
+#ifdef AGRESSIVE_TEST
+	if (size > 4)
+		size = 4;
+#endif
+	return size;
+}
+
+#ifdef EXT_DEBUG
+static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
+{
+	int k, l = path->p_depth;
+
+	ext_debug("path:");
+	for (k = 0; k <= l; k++, path++) {
+		if (path->p_idx) {
+		  ext_debug("  %d->%llu", le32_to_cpu(path->p_idx->ei_block),
+			    idx_pblock(path->p_idx));
+		} else if (path->p_ext) {
+			ext_debug("  %d:%d:%llu ",
+				  le32_to_cpu(path->p_ext->ee_block),
+				  le16_to_cpu(path->p_ext->ee_len),
+				  ext_pblock(path->p_ext));
+		} else
+			ext_debug("  []");
+	}
+	ext_debug("\n");
+}
+
+static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
+{
+	int depth = ext_depth(inode);
+	struct ext4_extent_header *eh;
+	struct ext4_extent *ex;
+	int i;
+
+	if (!path)
+		return;
+
+	eh = path[depth].p_hdr;
+	ex = EXT_FIRST_EXTENT(eh);
+
+	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
+		ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block),
+			  le16_to_cpu(ex->ee_len), ext_pblock(ex));
+	}
+	ext_debug("\n");
+}
+#else
+#define ext4_ext_show_path(inode,path)
+#define ext4_ext_show_leaf(inode,path)
+#endif
+
+static void ext4_ext_drop_refs(struct ext4_ext_path *path)
+{
+	int depth = path->p_depth;
+	int i;
+
+	for (i = 0; i <= depth; i++, path++)
+		if (path->p_bh) {
+			brelse(path->p_bh);
+			path->p_bh = NULL;
+		}
+}
+
+/*
+ * ext4_ext_binsearch_idx:
+ * binary search for the closest index of the given block
+ */
+static void
+ext4_ext_binsearch_idx(struct inode *inode, struct ext4_ext_path *path, int block)
+{
+	struct ext4_extent_header *eh = path->p_hdr;
+	struct ext4_extent_idx *r, *l, *m;
+
+	BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
+	BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
+	BUG_ON(le16_to_cpu(eh->eh_entries) <= 0);
+
+	ext_debug("binsearch for %d(idx):  ", block);
+
+	l = EXT_FIRST_INDEX(eh) + 1;
+	r = EXT_FIRST_INDEX(eh) + le16_to_cpu(eh->eh_entries) - 1;
+	while (l <= r) {
+		m = l + (r - l) / 2;
+		if (block < le32_to_cpu(m->ei_block))
+			r = m - 1;
+		else
+			l = m + 1;
+		ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ei_block,
+				m, m->ei_block, r, r->ei_block);
+	}
+
+	path->p_idx = l - 1;
+	ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
+		  idx_block(path->p_idx));
+
+#ifdef CHECK_BINSEARCH
+	{
+		struct ext4_extent_idx *chix, *ix;
+		int k;
+
+		chix = ix = EXT_FIRST_INDEX(eh);
+		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
+		  if (k != 0 &&
+		      le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
+				printk("k=%d, ix=0x%p, first=0x%p\n", k,
+					ix, EXT_FIRST_INDEX(eh));
+				printk("%u <= %u\n",
+				       le32_to_cpu(ix->ei_block),
+				       le32_to_cpu(ix[-1].ei_block));
+			}
+			BUG_ON(k && le32_to_cpu(ix->ei_block)
+				           <= le32_to_cpu(ix[-1].ei_block));
+			if (block < le32_to_cpu(ix->ei_block))
+				break;
+			chix = ix;
+		}
+		BUG_ON(chix != path->p_idx);
+	}
+#endif
+
+}
+
+/*
+ * ext4_ext_binsearch:
+ * binary search for closest extent of the given block
+ */
+static void
+ext4_ext_binsearch(struct inode *inode, struct ext4_ext_path *path, int block)
+{
+	struct ext4_extent_header *eh = path->p_hdr;
+	struct ext4_extent *r, *l, *m;
+
+	BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
+	BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
+
+	if (eh->eh_entries == 0) {
+		/*
+		 * this leaf is empty:
+		 * we get such a leaf in split/add case
+		 */
+		return;
+	}
+
+	ext_debug("binsearch for %d:  ", block);
+
+	l = EXT_FIRST_EXTENT(eh) + 1;
+	r = EXT_FIRST_EXTENT(eh) + le16_to_cpu(eh->eh_entries) - 1;
+
+	while (l <= r) {
+		m = l + (r - l) / 2;
+		if (block < le32_to_cpu(m->ee_block))
+			r = m - 1;
+		else
+			l = m + 1;
+		ext_debug("%p(%u):%p(%u):%p(%u) ", l, l->ee_block,
+				m, m->ee_block, r, r->ee_block);
+	}
+
+	path->p_ext = l - 1;
+	ext_debug("  -> %d:%llu:%d ",
+		        le32_to_cpu(path->p_ext->ee_block),
+		        ext_pblock(path->p_ext),
+			le16_to_cpu(path->p_ext->ee_len));
+
+#ifdef CHECK_BINSEARCH
+	{
+		struct ext4_extent *chex, *ex;
+		int k;
+
+		chex = ex = EXT_FIRST_EXTENT(eh);
+		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
+			BUG_ON(k && le32_to_cpu(ex->ee_block)
+				          <= le32_to_cpu(ex[-1].ee_block));
+			if (block < le32_to_cpu(ex->ee_block))
+				break;
+			chex = ex;
+		}
+		BUG_ON(chex != path->p_ext);
+	}
+#endif
+
+}
+
+int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
+{
+	struct ext4_extent_header *eh;
+
+	eh = ext_inode_hdr(inode);
+	eh->eh_depth = 0;
+	eh->eh_entries = 0;
+	eh->eh_magic = EXT4_EXT_MAGIC;
+	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode));
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_ext_invalidate_cache(inode);
+	return 0;
+}
+
+struct ext4_ext_path *
+ext4_ext_find_extent(struct inode *inode, int block, struct ext4_ext_path *path)
+{
+	struct ext4_extent_header *eh;
+	struct buffer_head *bh;
+	short int depth, i, ppos = 0, alloc = 0;
+
+	eh = ext_inode_hdr(inode);
+	BUG_ON(eh == NULL);
+	if (ext4_ext_check_header(__FUNCTION__, inode, eh))
+		return ERR_PTR(-EIO);
+
+	i = depth = ext_depth(inode);
+
+	/* account possible depth increase */
+	if (!path) {
+		path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 2),
+				GFP_NOFS);
+		if (!path)
+			return ERR_PTR(-ENOMEM);
+		alloc = 1;
+	}
+	memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
+	path[0].p_hdr = eh;
+
+	/* walk through the tree */
+	while (i) {
+		ext_debug("depth %d: num %d, max %d\n",
+			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+		ext4_ext_binsearch_idx(inode, path + ppos, block);
+		path[ppos].p_block = idx_pblock(path[ppos].p_idx);
+		path[ppos].p_depth = i;
+		path[ppos].p_ext = NULL;
+
+		bh = sb_bread(inode->i_sb, path[ppos].p_block);
+		if (!bh)
+			goto err;
+
+		eh = ext_block_hdr(bh);
+		ppos++;
+		BUG_ON(ppos > depth);
+		path[ppos].p_bh = bh;
+		path[ppos].p_hdr = eh;
+		i--;
+
+		if (ext4_ext_check_header(__FUNCTION__, inode, eh))
+			goto err;
+	}
+
+	path[ppos].p_depth = i;
+	path[ppos].p_hdr = eh;
+	path[ppos].p_ext = NULL;
+	path[ppos].p_idx = NULL;
+
+	if (ext4_ext_check_header(__FUNCTION__, inode, eh))
+		goto err;
+
+	/* find extent */
+	ext4_ext_binsearch(inode, path + ppos, block);
+
+	ext4_ext_show_path(inode, path);
+
+	return path;
+
+err:
+	ext4_ext_drop_refs(path);
+	if (alloc)
+		kfree(path);
+	return ERR_PTR(-EIO);
+}
+
+/*
+ * ext4_ext_insert_index:
+ * insert new index [@logical;@ptr] into the block at @curp;
+ * check where to insert: before @curp or after @curp
+ */
+static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+				struct ext4_ext_path *curp,
+				int logical, ext4_fsblk_t ptr)
+{
+	struct ext4_extent_idx *ix;
+	int len, err;
+
+	if ((err = ext4_ext_get_access(handle, inode, curp)))
+		return err;
+
+	BUG_ON(logical == le32_to_cpu(curp->p_idx->ei_block));
+	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
+	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
+		/* insert after */
+		if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) {
+			len = (len - 1) * sizeof(struct ext4_extent_idx);
+			len = len < 0 ? 0 : len;
+			ext_debug("insert new index %d after: %d. "
+					"move %d from 0x%p to 0x%p\n",
+					logical, ptr, len,
+					(curp->p_idx + 1), (curp->p_idx + 2));
+			memmove(curp->p_idx + 2, curp->p_idx + 1, len);
+		}
+		ix = curp->p_idx + 1;
+	} else {
+		/* insert before */
+		len = len * sizeof(struct ext4_extent_idx);
+		len = len < 0 ? 0 : len;
+		ext_debug("insert new index %d before: %d. "
+				"move %d from 0x%p to 0x%p\n",
+				logical, ptr, len,
+				curp->p_idx, (curp->p_idx + 1));
+		memmove(curp->p_idx + 1, curp->p_idx, len);
+		ix = curp->p_idx;
+	}
+
+	ix->ei_block = cpu_to_le32(logical);
+	ext4_idx_store_pblock(ix, ptr);
+	curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1);
+
+	BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries)
+	                     > le16_to_cpu(curp->p_hdr->eh_max));
+	BUG_ON(ix > EXT_LAST_INDEX(curp->p_hdr));
+
+	err = ext4_ext_dirty(handle, inode, curp);
+	ext4_std_error(inode->i_sb, err);
+
+	return err;
+}
+
+/*
+ * ext4_ext_split:
+ * inserts new subtree into the path, using free index entry
+ * at depth @at:
+ * - allocates all needed blocks (new leaf and all intermediate index blocks)
+ * - makes decision where to split
+ * - moves remaining extents and index entries (right to the split point)
+ *   into the newly allocated blocks
+ * - initializes subtree
+ */
+static int ext4_ext_split(handle_t *handle, struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_extent *newext, int at)
+{
+	struct buffer_head *bh = NULL;
+	int depth = ext_depth(inode);
+	struct ext4_extent_header *neh;
+	struct ext4_extent_idx *fidx;
+	struct ext4_extent *ex;
+	int i = at, k, m, a;
+	ext4_fsblk_t newblock, oldblock;
+	__le32 border;
+	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
+	int err = 0;
+
+	/* make decision: where to split? */
+	/* FIXME: now decision is simplest: at current extent */
+
+	/* if current leaf will be split, then we should use
+	 * border from split point */
+	BUG_ON(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr));
+	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		border = path[depth].p_ext[1].ee_block;
+		ext_debug("leaf will be split."
+				" next leaf starts at %d\n",
+			          le32_to_cpu(border));
+	} else {
+		border = newext->ee_block;
+		ext_debug("leaf will be added."
+				" next leaf starts at %d\n",
+			        le32_to_cpu(border));
+	}
+
+	/*
+	 * If error occurs, then we break processing
+	 * and mark filesystem read-only. index won't
+	 * be inserted and tree will be in consistent
+	 * state. Next mount will repair buffers too.
+	 */
+
+	/*
+	 * Get array to track all allocated blocks.
+	 * We need this to handle errors and free blocks
+	 * upon them.
+	 */
+	ablocks = kmalloc(sizeof(ext4_fsblk_t) * depth, GFP_NOFS);
+	if (!ablocks)
+		return -ENOMEM;
+	memset(ablocks, 0, sizeof(ext4_fsblk_t) * depth);
+
+	/* allocate all needed blocks */
+	ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
+	for (a = 0; a < depth - at; a++) {
+		newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+		if (newblock == 0)
+			goto cleanup;
+		ablocks[a] = newblock;
+	}
+
+	/* initialize new leaf */
+	newblock = ablocks[--a];
+	BUG_ON(newblock == 0);
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (!bh) {
+		err = -EIO;
+		goto cleanup;
+	}
+	lock_buffer(bh);
+
+	if ((err = ext4_journal_get_create_access(handle, bh)))
+		goto cleanup;
+
+	neh = ext_block_hdr(bh);
+	neh->eh_entries = 0;
+	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+	neh->eh_magic = EXT4_EXT_MAGIC;
+	neh->eh_depth = 0;
+	ex = EXT_FIRST_EXTENT(neh);
+
+	/* move remainder of path[depth] to the new leaf */
+	BUG_ON(path[depth].p_hdr->eh_entries != path[depth].p_hdr->eh_max);
+	/* start copy from next extent */
+	/* TODO: we could do it by single memmove */
+	m = 0;
+	path[depth].p_ext++;
+	while (path[depth].p_ext <=
+			EXT_MAX_EXTENT(path[depth].p_hdr)) {
+		ext_debug("move %d:%llu:%d in new leaf %llu\n",
+			        le32_to_cpu(path[depth].p_ext->ee_block),
+			        ext_pblock(path[depth].p_ext),
+			        le16_to_cpu(path[depth].p_ext->ee_len),
+				newblock);
+		/*memmove(ex++, path[depth].p_ext++,
+				sizeof(struct ext4_extent));
+		neh->eh_entries++;*/
+		path[depth].p_ext++;
+		m++;
+	}
+	if (m) {
+		memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+		neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m);
+	}
+
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	if ((err = ext4_journal_dirty_metadata(handle, bh)))
+		goto cleanup;
+	brelse(bh);
+	bh = NULL;
+
+	/* correct old leaf */
+	if (m) {
+		if ((err = ext4_ext_get_access(handle, inode, path + depth)))
+			goto cleanup;
+		path[depth].p_hdr->eh_entries =
+		     cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m);
+		if ((err = ext4_ext_dirty(handle, inode, path + depth)))
+			goto cleanup;
+
+	}
+
+	/* create intermediate indexes */
+	k = depth - at - 1;
+	BUG_ON(k < 0);
+	if (k)
+		ext_debug("create %d intermediate indices\n", k);
+	/* insert new index into current index block */
+	/* current depth stored in i var */
+	i = depth - 1;
+	while (k--) {
+		oldblock = newblock;
+		newblock = ablocks[--a];
+		bh = sb_getblk(inode->i_sb, (ext4_fsblk_t)newblock);
+		if (!bh) {
+			err = -EIO;
+			goto cleanup;
+		}
+		lock_buffer(bh);
+
+		if ((err = ext4_journal_get_create_access(handle, bh)))
+			goto cleanup;
+
+		neh = ext_block_hdr(bh);
+		neh->eh_entries = cpu_to_le16(1);
+		neh->eh_magic = EXT4_EXT_MAGIC;
+		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+		neh->eh_depth = cpu_to_le16(depth - i);
+		fidx = EXT_FIRST_INDEX(neh);
+		fidx->ei_block = border;
+		ext4_idx_store_pblock(fidx, oldblock);
+
+		ext_debug("int.index at %d (block %llu): %lu -> %llu\n", i,
+				newblock, (unsigned long) le32_to_cpu(border),
+				oldblock);
+		/* copy indexes */
+		m = 0;
+		path[i].p_idx++;
+
+		ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+				EXT_MAX_INDEX(path[i].p_hdr));
+		BUG_ON(EXT_MAX_INDEX(path[i].p_hdr) !=
+				EXT_LAST_INDEX(path[i].p_hdr));
+		while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
+			ext_debug("%d: move %d:%d in new index %llu\n", i,
+				        le32_to_cpu(path[i].p_idx->ei_block),
+				        idx_pblock(path[i].p_idx),
+				        newblock);
+			/*memmove(++fidx, path[i].p_idx++,
+					sizeof(struct ext4_extent_idx));
+			neh->eh_entries++;
+			BUG_ON(neh->eh_entries > neh->eh_max);*/
+			path[i].p_idx++;
+			m++;
+		}
+		if (m) {
+			memmove(++fidx, path[i].p_idx - m,
+				sizeof(struct ext4_extent_idx) * m);
+			neh->eh_entries =
+				cpu_to_le16(le16_to_cpu(neh->eh_entries) + m);
+		}
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		if ((err = ext4_journal_dirty_metadata(handle, bh)))
+			goto cleanup;
+		brelse(bh);
+		bh = NULL;
+
+		/* correct old index */
+		if (m) {
+			err = ext4_ext_get_access(handle, inode, path + i);
+			if (err)
+				goto cleanup;
+			path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m);
+			err = ext4_ext_dirty(handle, inode, path + i);
+			if (err)
+				goto cleanup;
+		}
+
+		i--;
+	}
+
+	/* insert new index */
+	if (err)
+		goto cleanup;
+
+	err = ext4_ext_insert_index(handle, inode, path + at,
+				    le32_to_cpu(border), newblock);
+
+cleanup:
+	if (bh) {
+		if (buffer_locked(bh))
+			unlock_buffer(bh);
+		brelse(bh);
+	}
+
+	if (err) {
+		/* free all allocated blocks in error case */
+		for (i = 0; i < depth; i++) {
+			if (!ablocks[i])
+				continue;
+			ext4_free_blocks(handle, inode, ablocks[i], 1);
+		}
+	}
+	kfree(ablocks);
+
+	return err;
+}
+
+/*
+ * ext4_ext_grow_indepth:
+ * implements tree growing procedure:
+ * - allocates new block
+ * - moves top-level data (index block or leaf) into the new block
+ * - initializes new top-level, creating index that points to the
+ *   just created block
+ */
+static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
+					struct ext4_ext_path *path,
+					struct ext4_extent *newext)
+{
+	struct ext4_ext_path *curp = path;
+	struct ext4_extent_header *neh;
+	struct ext4_extent_idx *fidx;
+	struct buffer_head *bh;
+	ext4_fsblk_t newblock;
+	int err = 0;
+
+	newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+	if (newblock == 0)
+		return err;
+
+	bh = sb_getblk(inode->i_sb, newblock);
+	if (!bh) {
+		err = -EIO;
+		ext4_std_error(inode->i_sb, err);
+		return err;
+	}
+	lock_buffer(bh);
+
+	if ((err = ext4_journal_get_create_access(handle, bh))) {
+		unlock_buffer(bh);
+		goto out;
+	}
+
+	/* move top-level index/leaf into new block */
+	memmove(bh->b_data, curp->p_hdr, sizeof(EXT4_I(inode)->i_data));
+
+	/* set size of new block */
+	neh = ext_block_hdr(bh);
+	/* old root could have indexes or leaves
+	 * so calculate e_max right way */
+	if (ext_depth(inode))
+	  neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode));
+	else
+	  neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode));
+	neh->eh_magic = EXT4_EXT_MAGIC;
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	if ((err = ext4_journal_dirty_metadata(handle, bh)))
+		goto out;
+
+	/* create index in new top-level index: num,max,pointer */
+	if ((err = ext4_ext_get_access(handle, inode, curp)))
+		goto out;
+
+	curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
+	curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode));
+	curp->p_hdr->eh_entries = cpu_to_le16(1);
+	curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
+	/* FIXME: it works, but actually path[0] can be index */
+	curp->p_idx->ei_block = EXT_FIRST_EXTENT(path[0].p_hdr)->ee_block;
+	ext4_idx_store_pblock(curp->p_idx, newblock);
+
+	neh = ext_inode_hdr(inode);
+	fidx = EXT_FIRST_INDEX(neh);
+	ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
+		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
+		  le32_to_cpu(fidx->ei_block), idx_pblock(fidx));
+
+	neh->eh_depth = cpu_to_le16(path->p_depth + 1);
+	err = ext4_ext_dirty(handle, inode, curp);
+out:
+	brelse(bh);
+
+	return err;
+}
+
+/*
+ * ext4_ext_create_new_leaf:
+ * finds empty index and adds new leaf.
+ * if no free index is found, then it requests in-depth growing.
+ */
+static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
+					struct ext4_ext_path *path,
+					struct ext4_extent *newext)
+{
+	struct ext4_ext_path *curp;
+	int depth, i, err = 0;
+
+repeat:
+	i = depth = ext_depth(inode);
+
+	/* walk up to the tree and look for free index entry */
+	curp = path + depth;
+	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
+		i--;
+		curp--;
+	}
+
+	/* we use already allocated block for index block,
+	 * so subsequent data blocks should be contiguous */
+	if (EXT_HAS_FREE_INDEX(curp)) {
+		/* if we found index with free entry, then use that
+		 * entry: create all needed subtree and add new leaf */
+		err = ext4_ext_split(handle, inode, path, newext, i);
+
+		/* refill path */
+		ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode,
+					    le32_to_cpu(newext->ee_block),
+					    path);
+		if (IS_ERR(path))
+			err = PTR_ERR(path);
+	} else {
+		/* tree is full, time to grow in depth */
+		err = ext4_ext_grow_indepth(handle, inode, path, newext);
+		if (err)
+			goto out;
+
+		/* refill path */
+		ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode,
+					    le32_to_cpu(newext->ee_block),
+					    path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
+		}
+
+		/*
+		 * only first (depth 0 -> 1) produces free space;
+		 * in all other cases we have to split the grown tree
+		 */
+		depth = ext_depth(inode);
+		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
+			/* now we need to split */
+			goto repeat;
+		}
+	}
+
+out:
+	return err;
+}
+
+/*
+ * ext4_ext_next_allocated_block:
+ * returns allocated block in subsequent extent or EXT_MAX_BLOCK.
+ * NOTE: it considers block number from index entry as
+ * allocated block. Thus, index entries have to be consistent
+ * with leaves.
+ */
+static unsigned long
+ext4_ext_next_allocated_block(struct ext4_ext_path *path)
+{
+	int depth;
+
+	BUG_ON(path == NULL);
+	depth = path->p_depth;
+
+	if (depth == 0 && path->p_ext == NULL)
+		return EXT_MAX_BLOCK;
+
+	while (depth >= 0) {
+		if (depth == path->p_depth) {
+			/* leaf */
+			if (path[depth].p_ext !=
+					EXT_LAST_EXTENT(path[depth].p_hdr))
+			  return le32_to_cpu(path[depth].p_ext[1].ee_block);
+		} else {
+			/* index */
+			if (path[depth].p_idx !=
+					EXT_LAST_INDEX(path[depth].p_hdr))
+			  return le32_to_cpu(path[depth].p_idx[1].ei_block);
+		}
+		depth--;
+	}
+
+	return EXT_MAX_BLOCK;
+}
+
+/*
+ * ext4_ext_next_leaf_block:
+ * returns first allocated block from next leaf or EXT_MAX_BLOCK
+ */
+static unsigned ext4_ext_next_leaf_block(struct inode *inode,
+					struct ext4_ext_path *path)
+{
+	int depth;
+
+	BUG_ON(path == NULL);
+	depth = path->p_depth;
+
+	/* zero-tree has no leaf blocks at all */
+	if (depth == 0)
+		return EXT_MAX_BLOCK;
+
+	/* go to index block */
+	depth--;
+
+	while (depth >= 0) {
+		if (path[depth].p_idx !=
+				EXT_LAST_INDEX(path[depth].p_hdr))
+		  return le32_to_cpu(path[depth].p_idx[1].ei_block);
+		depth--;
+	}
+
+	return EXT_MAX_BLOCK;
+}
+
+/*
+ * ext4_ext_correct_indexes:
+ * if leaf gets modified and modified extent is first in the leaf,
+ * then we have to correct all indexes above.
+ * TODO: do we need to correct tree in all cases?
+ */
+int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
+				struct ext4_ext_path *path)
+{
+	struct ext4_extent_header *eh;
+	int depth = ext_depth(inode);
+	struct ext4_extent *ex;
+	__le32 border;
+	int k, err = 0;
+
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+	BUG_ON(ex == NULL);
+	BUG_ON(eh == NULL);
+
+	if (depth == 0) {
+		/* there is no tree at all */
+		return 0;
+	}
+
+	if (ex != EXT_FIRST_EXTENT(eh)) {
+		/* we correct tree if first leaf got modified only */
+		return 0;
+	}
+
+	/*
+	 * TODO: we need correction if border is smaller than current one
+	 */
+	k = depth - 1;
+	border = path[depth].p_ext->ee_block;
+	if ((err = ext4_ext_get_access(handle, inode, path + k)))
+		return err;
+	path[k].p_idx->ei_block = border;
+	if ((err = ext4_ext_dirty(handle, inode, path + k)))
+		return err;
+
+	while (k--) {
+		/* change all left-side indexes */
+		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
+			break;
+		if ((err = ext4_ext_get_access(handle, inode, path + k)))
+			break;
+		path[k].p_idx->ei_block = border;
+		if ((err = ext4_ext_dirty(handle, inode, path + k)))
+			break;
+	}
+
+	return err;
+}
+
+static int inline
+ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
+				struct ext4_extent *ex2)
+{
+	if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) !=
+			le32_to_cpu(ex2->ee_block))
+		return 0;
+
+	/*
+	 * To allow future support for preallocated extents to be added
+	 * as an RO_COMPAT feature, refuse to merge to extents if
+	 * this can result in the top bit of ee_len being set.
+	 */
+	if (le16_to_cpu(ex1->ee_len) + le16_to_cpu(ex2->ee_len) > EXT_MAX_LEN)
+		return 0;
+#ifdef AGRESSIVE_TEST
+	if (le16_to_cpu(ex1->ee_len) >= 4)
+		return 0;
+#endif
+
+	if (ext_pblock(ex1) + le16_to_cpu(ex1->ee_len) == ext_pblock(ex2))
+		return 1;
+	return 0;
+}
+
+/*
+ * ext4_ext_insert_extent:
+ * tries to merge requsted extent into the existing extent or
+ * inserts requested extent as new one into the tree,
+ * creating new leaf in the no-space case.
+ */
+int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+				struct ext4_ext_path *path,
+				struct ext4_extent *newext)
+{
+	struct ext4_extent_header * eh;
+	struct ext4_extent *ex, *fex;
+	struct ext4_extent *nearex; /* nearest extent */
+	struct ext4_ext_path *npath = NULL;
+	int depth, len, err, next;
+
+	BUG_ON(newext->ee_len == 0);
+	depth = ext_depth(inode);
+	ex = path[depth].p_ext;
+	BUG_ON(path[depth].p_hdr == NULL);
+
+	/* try to insert block into found extent and return */
+	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
+		ext_debug("append %d block to %d:%d (from %llu)\n",
+				le16_to_cpu(newext->ee_len),
+				le32_to_cpu(ex->ee_block),
+				le16_to_cpu(ex->ee_len), ext_pblock(ex));
+		if ((err = ext4_ext_get_access(handle, inode, path + depth)))
+			return err;
+		ex->ee_len = cpu_to_le16(le16_to_cpu(ex->ee_len)
+					 + le16_to_cpu(newext->ee_len));
+		eh = path[depth].p_hdr;
+		nearex = ex;
+		goto merge;
+	}
+
+repeat:
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
+		goto has_space;
+
+	/* probably next leaf has space for us? */
+	fex = EXT_LAST_EXTENT(eh);
+	next = ext4_ext_next_leaf_block(inode, path);
+	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
+	    && next != EXT_MAX_BLOCK) {
+		ext_debug("next leaf block - %d\n", next);
+		BUG_ON(npath != NULL);
+		npath = ext4_ext_find_extent(inode, next, NULL);
+		if (IS_ERR(npath))
+			return PTR_ERR(npath);
+		BUG_ON(npath->p_depth != path->p_depth);
+		eh = npath[depth].p_hdr;
+		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
+			ext_debug("next leaf isnt full(%d)\n",
+				  le16_to_cpu(eh->eh_entries));
+			path = npath;
+			goto repeat;
+		}
+		ext_debug("next leaf has no free space(%d,%d)\n",
+			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
+	}
+
+	/*
+	 * There is no free space in the found leaf.
+	 * We're gonna add a new leaf in the tree.
+	 */
+	err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+	if (err)
+		goto cleanup;
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+
+has_space:
+	nearex = path[depth].p_ext;
+
+	if ((err = ext4_ext_get_access(handle, inode, path + depth)))
+		goto cleanup;
+
+	if (!nearex) {
+		/* there is no extent in this leaf, create first one */
+		ext_debug("first extent in the leaf: %d:%llu:%d\n",
+			        le32_to_cpu(newext->ee_block),
+			        ext_pblock(newext),
+			        le16_to_cpu(newext->ee_len));
+		path[depth].p_ext = EXT_FIRST_EXTENT(eh);
+	} else if (le32_to_cpu(newext->ee_block)
+		           > le32_to_cpu(nearex->ee_block)) {
+/*		BUG_ON(newext->ee_block == nearex->ee_block); */
+		if (nearex != EXT_LAST_EXTENT(eh)) {
+			len = EXT_MAX_EXTENT(eh) - nearex;
+			len = (len - 1) * sizeof(struct ext4_extent);
+			len = len < 0 ? 0 : len;
+			ext_debug("insert %d:%llu:%d after: nearest 0x%p, "
+					"move %d from 0x%p to 0x%p\n",
+				        le32_to_cpu(newext->ee_block),
+				        ext_pblock(newext),
+				        le16_to_cpu(newext->ee_len),
+					nearex, len, nearex + 1, nearex + 2);
+			memmove(nearex + 2, nearex + 1, len);
+		}
+		path[depth].p_ext = nearex + 1;
+	} else {
+		BUG_ON(newext->ee_block == nearex->ee_block);
+		len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
+		len = len < 0 ? 0 : len;
+		ext_debug("insert %d:%llu:%d before: nearest 0x%p, "
+				"move %d from 0x%p to 0x%p\n",
+				le32_to_cpu(newext->ee_block),
+				ext_pblock(newext),
+				le16_to_cpu(newext->ee_len),
+				nearex, len, nearex + 1, nearex + 2);
+		memmove(nearex + 1, nearex, len);
+		path[depth].p_ext = nearex;
+	}
+
+	eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1);
+	nearex = path[depth].p_ext;
+	nearex->ee_block = newext->ee_block;
+	nearex->ee_start = newext->ee_start;
+	nearex->ee_start_hi = newext->ee_start_hi;
+	nearex->ee_len = newext->ee_len;
+
+merge:
+	/* try to merge extents to the right */
+	while (nearex < EXT_LAST_EXTENT(eh)) {
+		if (!ext4_can_extents_be_merged(inode, nearex, nearex + 1))
+			break;
+		/* merge with next extent! */
+		nearex->ee_len = cpu_to_le16(le16_to_cpu(nearex->ee_len)
+					     + le16_to_cpu(nearex[1].ee_len));
+		if (nearex + 1 < EXT_LAST_EXTENT(eh)) {
+			len = (EXT_LAST_EXTENT(eh) - nearex - 1)
+					* sizeof(struct ext4_extent);
+			memmove(nearex + 1, nearex + 2, len);
+		}
+		eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
+		BUG_ON(eh->eh_entries == 0);
+	}
+
+	/* try to merge extents to the left */
+
+	/* time to correct all indexes above */
+	err = ext4_ext_correct_indexes(handle, inode, path);
+	if (err)
+		goto cleanup;
+
+	err = ext4_ext_dirty(handle, inode, path + depth);
+
+cleanup:
+	if (npath) {
+		ext4_ext_drop_refs(npath);
+		kfree(npath);
+	}
+	ext4_ext_tree_changed(inode);
+	ext4_ext_invalidate_cache(inode);
+	return err;
+}
+
+int ext4_ext_walk_space(struct inode *inode, unsigned long block,
+			unsigned long num, ext_prepare_callback func,
+			void *cbdata)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_ext_cache cbex;
+	struct ext4_extent *ex;
+	unsigned long next, start = 0, end = 0;
+	unsigned long last = block + num;
+	int depth, exists, err = 0;
+
+	BUG_ON(func == NULL);
+	BUG_ON(inode == NULL);
+
+	while (block < last && block != EXT_MAX_BLOCK) {
+		num = last - block;
+		/* find extent for this block */
+		path = ext4_ext_find_extent(inode, block, path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			path = NULL;
+			break;
+		}
+
+		depth = ext_depth(inode);
+		BUG_ON(path[depth].p_hdr == NULL);
+		ex = path[depth].p_ext;
+		next = ext4_ext_next_allocated_block(path);
+
+		exists = 0;
+		if (!ex) {
+			/* there is no extent yet, so try to allocate
+			 * all requested space */
+			start = block;
+			end = block + num;
+		} else if (le32_to_cpu(ex->ee_block) > block) {
+			/* need to allocate space before found extent */
+			start = block;
+			end = le32_to_cpu(ex->ee_block);
+			if (block + num < end)
+				end = block + num;
+		} else if (block >=
+			     le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len)) {
+			/* need to allocate space after found extent */
+			start = block;
+			end = block + num;
+			if (end >= next)
+				end = next;
+		} else if (block >= le32_to_cpu(ex->ee_block)) {
+			/*
+			 * some part of requested space is covered
+			 * by found extent
+			 */
+			start = block;
+			end = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len);
+			if (block + num < end)
+				end = block + num;
+			exists = 1;
+		} else {
+			BUG();
+		}
+		BUG_ON(end <= start);
+
+		if (!exists) {
+			cbex.ec_block = start;
+			cbex.ec_len = end - start;
+			cbex.ec_start = 0;
+			cbex.ec_type = EXT4_EXT_CACHE_GAP;
+		} else {
+		        cbex.ec_block = le32_to_cpu(ex->ee_block);
+		        cbex.ec_len = le16_to_cpu(ex->ee_len);
+		        cbex.ec_start = ext_pblock(ex);
+			cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
+		}
+
+		BUG_ON(cbex.ec_len == 0);
+		err = func(inode, path, &cbex, cbdata);
+		ext4_ext_drop_refs(path);
+
+		if (err < 0)
+			break;
+		if (err == EXT_REPEAT)
+			continue;
+		else if (err == EXT_BREAK) {
+			err = 0;
+			break;
+		}
+
+		if (ext_depth(inode) != depth) {
+			/* depth was changed. we have to realloc path */
+			kfree(path);
+			path = NULL;
+		}
+
+		block = cbex.ec_block + cbex.ec_len;
+	}
+
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+
+	return err;
+}
+
+static inline void
+ext4_ext_put_in_cache(struct inode *inode, __u32 block,
+			__u32 len, __u32 start, int type)
+{
+	struct ext4_ext_cache *cex;
+	BUG_ON(len == 0);
+	cex = &EXT4_I(inode)->i_cached_extent;
+	cex->ec_type = type;
+	cex->ec_block = block;
+	cex->ec_len = len;
+	cex->ec_start = start;
+}
+
+/*
+ * ext4_ext_put_gap_in_cache:
+ * calculate boundaries of the gap that the requested block fits into
+ * and cache this gap
+ */
+static inline void
+ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
+				unsigned long block)
+{
+	int depth = ext_depth(inode);
+	unsigned long lblock, len;
+	struct ext4_extent *ex;
+
+	ex = path[depth].p_ext;
+	if (ex == NULL) {
+		/* there is no extent yet, so gap is [0;-] */
+		lblock = 0;
+		len = EXT_MAX_BLOCK;
+		ext_debug("cache gap(whole file):");
+	} else if (block < le32_to_cpu(ex->ee_block)) {
+		lblock = block;
+		len = le32_to_cpu(ex->ee_block) - block;
+		ext_debug("cache gap(before): %lu [%lu:%lu]",
+				(unsigned long) block,
+			        (unsigned long) le32_to_cpu(ex->ee_block),
+			        (unsigned long) le16_to_cpu(ex->ee_len));
+	} else if (block >= le32_to_cpu(ex->ee_block)
+		            + le16_to_cpu(ex->ee_len)) {
+	        lblock = le32_to_cpu(ex->ee_block)
+		         + le16_to_cpu(ex->ee_len);
+		len = ext4_ext_next_allocated_block(path);
+		ext_debug("cache gap(after): [%lu:%lu] %lu",
+			        (unsigned long) le32_to_cpu(ex->ee_block),
+			        (unsigned long) le16_to_cpu(ex->ee_len),
+				(unsigned long) block);
+		BUG_ON(len == lblock);
+		len = len - lblock;
+	} else {
+		lblock = len = 0;
+		BUG();
+	}
+
+	ext_debug(" -> %lu:%lu\n", (unsigned long) lblock, len);
+	ext4_ext_put_in_cache(inode, lblock, len, 0, EXT4_EXT_CACHE_GAP);
+}
+
+static inline int
+ext4_ext_in_cache(struct inode *inode, unsigned long block,
+			struct ext4_extent *ex)
+{
+	struct ext4_ext_cache *cex;
+
+	cex = &EXT4_I(inode)->i_cached_extent;
+
+	/* has cache valid data? */
+	if (cex->ec_type == EXT4_EXT_CACHE_NO)
+		return EXT4_EXT_CACHE_NO;
+
+	BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
+			cex->ec_type != EXT4_EXT_CACHE_EXTENT);
+	if (block >= cex->ec_block && block < cex->ec_block + cex->ec_len) {
+	        ex->ee_block = cpu_to_le32(cex->ec_block);
+		ext4_ext_store_pblock(ex, cex->ec_start);
+	        ex->ee_len = cpu_to_le16(cex->ec_len);
+		ext_debug("%lu cached by %lu:%lu:%llu\n",
+				(unsigned long) block,
+				(unsigned long) cex->ec_block,
+				(unsigned long) cex->ec_len,
+				cex->ec_start);
+		return cex->ec_type;
+	}
+
+	/* not in cache */
+	return EXT4_EXT_CACHE_NO;
+}
+
+/*
+ * ext4_ext_rm_idx:
+ * removes index from the index block.
+ * It's used in truncate case only, thus all requests are for
+ * last index in the block only.
+ */
+int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
+			struct ext4_ext_path *path)
+{
+	struct buffer_head *bh;
+	int err;
+	ext4_fsblk_t leaf;
+
+	/* free index block */
+	path--;
+	leaf = idx_pblock(path->p_idx);
+	BUG_ON(path->p_hdr->eh_entries == 0);
+	if ((err = ext4_ext_get_access(handle, inode, path)))
+		return err;
+	path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1);
+	if ((err = ext4_ext_dirty(handle, inode, path)))
+		return err;
+	ext_debug("index is empty, remove it, free block %llu\n", leaf);
+	bh = sb_find_get_block(inode->i_sb, leaf);
+	ext4_forget(handle, 1, inode, bh, leaf);
+	ext4_free_blocks(handle, inode, leaf, 1);
+	return err;
+}
+
+/*
+ * ext4_ext_calc_credits_for_insert:
+ * This routine returns max. credits that the extent tree can consume.
+ * It should be OK for low-performance paths like ->writepage()
+ * To allow many writing processes to fit into a single transaction,
+ * the caller should calculate credits under truncate_mutex and
+ * pass the actual path.
+ */
+int inline ext4_ext_calc_credits_for_insert(struct inode *inode,
+						struct ext4_ext_path *path)
+{
+	int depth, needed;
+
+	if (path) {
+		/* probably there is space in leaf? */
+		depth = ext_depth(inode);
+		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
+				< le16_to_cpu(path[depth].p_hdr->eh_max))
+			return 1;
+	}
+
+	/*
+	 * given 32-bit logical block (4294967296 blocks), max. tree
+	 * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
+	 * Let's also add one more level for imbalance.
+	 */
+	depth = 5;
+
+	/* allocation of new data block(s) */
+	needed = 2;
+
+	/*
+	 * tree can be full, so it would need to grow in depth:
+	 * allocation + old root + new root
+	 */
+	needed += 2 + 1 + 1;
+
+	/*
+	 * Index split can happen, we would need:
+	 *    allocate intermediate indexes (bitmap + group)
+	 *  + change two blocks at each level, but root (already included)
+	 */
+	needed = (depth * 2) + (depth * 2);
+
+	/* any allocation modifies superblock */
+	needed += 1;
+
+	return needed;
+}
+
+static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
+				struct ext4_extent *ex,
+				unsigned long from, unsigned long to)
+{
+	struct buffer_head *bh;
+	int i;
+
+#ifdef EXTENTS_STATS
+	{
+		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+		unsigned short ee_len =  le16_to_cpu(ex->ee_len);
+		spin_lock(&sbi->s_ext_stats_lock);
+		sbi->s_ext_blocks += ee_len;
+		sbi->s_ext_extents++;
+		if (ee_len < sbi->s_ext_min)
+			sbi->s_ext_min = ee_len;
+		if (ee_len > sbi->s_ext_max)
+			sbi->s_ext_max = ee_len;
+		if (ext_depth(inode) > sbi->s_depth_max)
+			sbi->s_depth_max = ext_depth(inode);
+		spin_unlock(&sbi->s_ext_stats_lock);
+	}
+#endif
+	if (from >= le32_to_cpu(ex->ee_block)
+	    && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+		/* tail removal */
+		unsigned long num;
+		ext4_fsblk_t start;
+		num = le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - from;
+		start = ext_pblock(ex) + le16_to_cpu(ex->ee_len) - num;
+		ext_debug("free last %lu blocks starting %llu\n", num, start);
+		for (i = 0; i < num; i++) {
+			bh = sb_find_get_block(inode->i_sb, start + i);
+			ext4_forget(handle, 0, inode, bh, start + i);
+		}
+		ext4_free_blocks(handle, inode, start, num);
+	} else if (from == le32_to_cpu(ex->ee_block)
+		   && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) {
+		printk("strange request: removal %lu-%lu from %u:%u\n",
+		       from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+	} else {
+		printk("strange request: removal(2) %lu-%lu from %u:%u\n",
+		       from, to, le32_to_cpu(ex->ee_block), le16_to_cpu(ex->ee_len));
+	}
+	return 0;
+}
+
+static int
+ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
+		struct ext4_ext_path *path, unsigned long start)
+{
+	int err = 0, correct_index = 0;
+	int depth = ext_depth(inode), credits;
+	struct ext4_extent_header *eh;
+	unsigned a, b, block, num;
+	unsigned long ex_ee_block;
+	unsigned short ex_ee_len;
+	struct ext4_extent *ex;
+
+	ext_debug("truncate since %lu in leaf\n", start);
+	if (!path[depth].p_hdr)
+		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
+	eh = path[depth].p_hdr;
+	BUG_ON(eh == NULL);
+	BUG_ON(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max));
+	BUG_ON(eh->eh_magic != EXT4_EXT_MAGIC);
+
+	/* find where to start removing */
+	ex = EXT_LAST_EXTENT(eh);
+
+	ex_ee_block = le32_to_cpu(ex->ee_block);
+	ex_ee_len = le16_to_cpu(ex->ee_len);
+
+	while (ex >= EXT_FIRST_EXTENT(eh) &&
+			ex_ee_block + ex_ee_len > start) {
+		ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
+		path[depth].p_ext = ex;
+
+		a = ex_ee_block > start ? ex_ee_block : start;
+		b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
+			ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+
+		ext_debug("  border %u:%u\n", a, b);
+
+		if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
+			block = 0;
+			num = 0;
+			BUG();
+		} else if (a != ex_ee_block) {
+			/* remove tail of the extent */
+			block = ex_ee_block;
+			num = a - block;
+		} else if (b != ex_ee_block + ex_ee_len - 1) {
+			/* remove head of the extent */
+			block = a;
+			num = b - a;
+			/* there is no "make a hole" API yet */
+			BUG();
+		} else {
+			/* remove whole extent: excellent! */
+			block = ex_ee_block;
+			num = 0;
+			BUG_ON(a != ex_ee_block);
+			BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+		}
+
+		/* at present, extent can't cross block group: */
+		/* leaf + bitmap + group desc + sb + inode */
+		credits = 5;
+		if (ex == EXT_FIRST_EXTENT(eh)) {
+			correct_index = 1;
+			credits += (ext_depth(inode)) + 1;
+		}
+#ifdef CONFIG_QUOTA
+		credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+#endif
+
+		handle = ext4_ext_journal_restart(handle, credits);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto out;
+		}
+
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		err = ext4_remove_blocks(handle, inode, ex, a, b);
+		if (err)
+			goto out;
+
+		if (num == 0) {
+			/* this extent is removed; mark slot entirely unused */
+			ext4_ext_store_pblock(ex, 0);
+			eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1);
+		}
+
+		ex->ee_block = cpu_to_le32(block);
+		ex->ee_len = cpu_to_le16(num);
+
+		err = ext4_ext_dirty(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		ext_debug("new extent: %u:%u:%llu\n", block, num,
+				ext_pblock(ex));
+		ex--;
+		ex_ee_block = le32_to_cpu(ex->ee_block);
+		ex_ee_len = le16_to_cpu(ex->ee_len);
+	}
+
+	if (correct_index && eh->eh_entries)
+		err = ext4_ext_correct_indexes(handle, inode, path);
+
+	/* if this leaf is free, then we should
+	 * remove it from index block above */
+	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
+		err = ext4_ext_rm_idx(handle, inode, path + depth);
+
+out:
+	return err;
+}
+
+/*
+ * ext4_ext_more_to_rm:
+ * returns 1 if current index has to be freed (even partial)
+ */
+static int inline
+ext4_ext_more_to_rm(struct ext4_ext_path *path)
+{
+	BUG_ON(path->p_idx == NULL);
+
+	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
+		return 0;
+
+	/*
+	 * if truncate on deeper level happened, it wasn't partial,
+	 * so we have to consider current index for truncation
+	 */
+	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
+		return 0;
+	return 1;
+}
+
+int ext4_ext_remove_space(struct inode *inode, unsigned long start)
+{
+	struct super_block *sb = inode->i_sb;
+	int depth = ext_depth(inode);
+	struct ext4_ext_path *path;
+	handle_t *handle;
+	int i = 0, err = 0;
+
+	ext_debug("truncate since %lu\n", start);
+
+	/* probably first extent we're gonna free will be last in block */
+	handle = ext4_journal_start(inode, depth + 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	ext4_ext_invalidate_cache(inode);
+
+	/*
+	 * We start scanning from right side, freeing all the blocks
+	 * after i_size and walking into the tree depth-wise.
+	 */
+	path = kmalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL);
+	if (path == NULL) {
+		ext4_journal_stop(handle);
+		return -ENOMEM;
+	}
+	memset(path, 0, sizeof(struct ext4_ext_path) * (depth + 1));
+	path[0].p_hdr = ext_inode_hdr(inode);
+	if (ext4_ext_check_header(__FUNCTION__, inode, path[0].p_hdr)) {
+		err = -EIO;
+		goto out;
+	}
+	path[0].p_depth = depth;
+
+	while (i >= 0 && err == 0) {
+		if (i == depth) {
+			/* this is leaf block */
+			err = ext4_ext_rm_leaf(handle, inode, path, start);
+			/* root level has p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			continue;
+		}
+
+		/* this is index block */
+		if (!path[i].p_hdr) {
+			ext_debug("initialize header\n");
+			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
+			if (ext4_ext_check_header(__FUNCTION__, inode,
+							path[i].p_hdr)) {
+				err = -EIO;
+				goto out;
+			}
+		}
+
+		BUG_ON(le16_to_cpu(path[i].p_hdr->eh_entries)
+			   > le16_to_cpu(path[i].p_hdr->eh_max));
+		BUG_ON(path[i].p_hdr->eh_magic != EXT4_EXT_MAGIC);
+
+		if (!path[i].p_idx) {
+			/* this level hasn't been touched yet */
+			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
+			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
+			ext_debug("init index ptr: hdr 0x%p, num %d\n",
+				  path[i].p_hdr,
+				  le16_to_cpu(path[i].p_hdr->eh_entries));
+		} else {
+			/* we were already here, see at next index */
+			path[i].p_idx--;
+		}
+
+		ext_debug("level %d - index, first 0x%p, cur 0x%p\n",
+				i, EXT_FIRST_INDEX(path[i].p_hdr),
+				path[i].p_idx);
+		if (ext4_ext_more_to_rm(path + i)) {
+			/* go to the next level */
+			ext_debug("move to level %d (block %llu)\n",
+				  i + 1, idx_pblock(path[i].p_idx));
+			memset(path + i + 1, 0, sizeof(*path));
+			path[i+1].p_bh =
+				sb_bread(sb, idx_pblock(path[i].p_idx));
+			if (!path[i+1].p_bh) {
+				/* should we reset i_size? */
+				err = -EIO;
+				break;
+			}
+
+			/* save actual number of indexes since this
+			 * number is changed at the next iteration */
+			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
+			i++;
+		} else {
+			/* we finished processing this index, go up */
+			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
+				/* index is empty, remove it;
+				 * handle must be already prepared by the
+				 * truncatei_leaf() */
+				err = ext4_ext_rm_idx(handle, inode, path + i);
+			}
+			/* root level has p_bh == NULL, brelse() eats this */
+			brelse(path[i].p_bh);
+			path[i].p_bh = NULL;
+			i--;
+			ext_debug("return to level %d\n", i);
+		}
+	}
+
+	/* TODO: flexible tree reduction should be here */
+	if (path->p_hdr->eh_entries == 0) {
+		/*
+		 * truncate to zero freed all the tree,
+		 * so we need to correct eh_depth
+		 */
+		err = ext4_ext_get_access(handle, inode, path);
+		if (err == 0) {
+			ext_inode_hdr(inode)->eh_depth = 0;
+			ext_inode_hdr(inode)->eh_max =
+				cpu_to_le16(ext4_ext_space_root(inode));
+			err = ext4_ext_dirty(handle, inode, path);
+		}
+	}
+out:
+	ext4_ext_tree_changed(inode);
+	ext4_ext_drop_refs(path);
+	kfree(path);
+	ext4_journal_stop(handle);
+
+	return err;
+}
+
+/*
+ * called at mount time
+ */
+void ext4_ext_init(struct super_block *sb)
+{
+	/*
+	 * possible initialization would be here
+	 */
+
+	if (test_opt(sb, EXTENTS)) {
+		printk("EXT4-fs: file extents enabled");
+#ifdef AGRESSIVE_TEST
+		printk(", agressive tests");
+#endif
+#ifdef CHECK_BINSEARCH
+		printk(", check binsearch");
+#endif
+#ifdef EXTENTS_STATS
+		printk(", stats");
+#endif
+		printk("\n");
+#ifdef EXTENTS_STATS
+		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
+		EXT4_SB(sb)->s_ext_min = 1 << 30;
+		EXT4_SB(sb)->s_ext_max = 0;
+#endif
+	}
+}
+
+/*
+ * called at umount time
+ */
+void ext4_ext_release(struct super_block *sb)
+{
+	if (!test_opt(sb, EXTENTS))
+		return;
+
+#ifdef EXTENTS_STATS
+	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
+		struct ext4_sb_info *sbi = EXT4_SB(sb);
+		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
+			sbi->s_ext_blocks, sbi->s_ext_extents,
+			sbi->s_ext_blocks / sbi->s_ext_extents);
+		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
+			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
+	}
+#endif
+}
+
+int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t iblock,
+			unsigned long max_blocks, struct buffer_head *bh_result,
+			int create, int extend_disksize)
+{
+	struct ext4_ext_path *path = NULL;
+	struct ext4_extent newex, *ex;
+	ext4_fsblk_t goal, newblock;
+	int err = 0, depth;
+	unsigned long allocated = 0;
+
+	__clear_bit(BH_New, &bh_result->b_state);
+	ext_debug("blocks %d/%lu requested for inode %u\n", (int) iblock,
+			max_blocks, (unsigned) inode->i_ino);
+	mutex_lock(&EXT4_I(inode)->truncate_mutex);
+
+	/* check in cache */
+	if ((goal = ext4_ext_in_cache(inode, iblock, &newex))) {
+		if (goal == EXT4_EXT_CACHE_GAP) {
+			if (!create) {
+				/* block isn't allocated yet and
+				 * user doesn't want to allocate it */
+				goto out2;
+			}
+			/* we should allocate requested block */
+		} else if (goal == EXT4_EXT_CACHE_EXTENT) {
+			/* block is already allocated */
+		        newblock = iblock
+		                   - le32_to_cpu(newex.ee_block)
+			           + ext_pblock(&newex);
+			/* number of remaining blocks in the extent */
+			allocated = le16_to_cpu(newex.ee_len) -
+					(iblock - le32_to_cpu(newex.ee_block));
+			goto out;
+		} else {
+			BUG();
+		}
+	}
+
+	/* find extent for this block */
+	path = ext4_ext_find_extent(inode, iblock, NULL);
+	if (IS_ERR(path)) {
+		err = PTR_ERR(path);
+		path = NULL;
+		goto out2;
+	}
+
+	depth = ext_depth(inode);
+
+	/*
+	 * consistent leaf must not be empty;
+	 * this situation is possible, though, _during_ tree modification;
+	 * this is why assert can't be put in ext4_ext_find_extent()
+	 */
+	BUG_ON(path[depth].p_ext == NULL && depth != 0);
+
+	if ((ex = path[depth].p_ext)) {
+	        unsigned long ee_block = le32_to_cpu(ex->ee_block);
+		ext4_fsblk_t ee_start = ext_pblock(ex);
+		unsigned short ee_len  = le16_to_cpu(ex->ee_len);
+
+		/*
+		 * Allow future support for preallocated extents to be added
+		 * as an RO_COMPAT feature:
+		 * Uninitialized extents are treated as holes, except that
+		 * we avoid (fail) allocating new blocks during a write.
+		 */
+		if (ee_len > EXT_MAX_LEN)
+			goto out2;
+		/* if found extent covers block, simply return it */
+	        if (iblock >= ee_block && iblock < ee_block + ee_len) {
+			newblock = iblock - ee_block + ee_start;
+			/* number of remaining blocks in the extent */
+			allocated = ee_len - (iblock - ee_block);
+			ext_debug("%d fit into %lu:%d -> %llu\n", (int) iblock,
+					ee_block, ee_len, newblock);
+			ext4_ext_put_in_cache(inode, ee_block, ee_len,
+						ee_start, EXT4_EXT_CACHE_EXTENT);
+			goto out;
+		}
+	}
+
+	/*
+	 * requested block isn't allocated yet;
+	 * we couldn't try to create block if create flag is zero
+	 */
+	if (!create) {
+		/* put just found gap into cache to speed up
+		 * subsequent requests */
+		ext4_ext_put_gap_in_cache(inode, path, iblock);
+		goto out2;
+	}
+	/*
+	 * Okay, we need to do block allocation.  Lazily initialize the block
+	 * allocation info here if necessary.
+	 */
+	if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
+		ext4_init_block_alloc_info(inode);
+
+	/* allocate new block */
+	goal = ext4_ext_find_goal(inode, path, iblock);
+	allocated = max_blocks;
+	newblock = ext4_new_blocks(handle, inode, goal, &allocated, &err);
+	if (!newblock)
+		goto out2;
+	ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
+			goal, newblock, allocated);
+
+	/* try to insert new extent into found leaf and return */
+	newex.ee_block = cpu_to_le32(iblock);
+	ext4_ext_store_pblock(&newex, newblock);
+	newex.ee_len = cpu_to_le16(allocated);
+	err = ext4_ext_insert_extent(handle, inode, path, &newex);
+	if (err)
+		goto out2;
+
+	if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
+		EXT4_I(inode)->i_disksize = inode->i_size;
+
+	/* previous routine could use block we allocated */
+	newblock = ext_pblock(&newex);
+	__set_bit(BH_New, &bh_result->b_state);
+
+	ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+				EXT4_EXT_CACHE_EXTENT);
+out:
+	if (allocated > max_blocks)
+		allocated = max_blocks;
+	ext4_ext_show_leaf(inode, path);
+	__set_bit(BH_Mapped, &bh_result->b_state);
+	bh_result->b_bdev = inode->i_sb->s_bdev;
+	bh_result->b_blocknr = newblock;
+out2:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+
+	return err ? err : allocated;
+}
+
+void ext4_ext_truncate(struct inode * inode, struct page *page)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct super_block *sb = inode->i_sb;
+	unsigned long last_block;
+	handle_t *handle;
+	int err = 0;
+
+	/*
+	 * probably first extent we're gonna free will be last in block
+	 */
+	err = ext4_writepage_trans_blocks(inode) + 3;
+	handle = ext4_journal_start(inode, err);
+	if (IS_ERR(handle)) {
+		if (page) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+		return;
+	}
+
+	if (page)
+		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+
+	mutex_lock(&EXT4_I(inode)->truncate_mutex);
+	ext4_ext_invalidate_cache(inode);
+
+	/*
+	 * TODO: optimization is possible here.
+	 * Probably we need not scan at all,
+	 * because page truncation is enough.
+	 */
+	if (ext4_orphan_add(handle, inode))
+		goto out_stop;
+
+	/* we have to know where to truncate from in crash case */
+	EXT4_I(inode)->i_disksize = inode->i_size;
+	ext4_mark_inode_dirty(handle, inode);
+
+	last_block = (inode->i_size + sb->s_blocksize - 1)
+			>> EXT4_BLOCK_SIZE_BITS(sb);
+	err = ext4_ext_remove_space(inode, last_block);
+
+	/* In a multi-transaction truncate, we only make the final
+	 * transaction synchronous. */
+	if (IS_SYNC(inode))
+		handle->h_sync = 1;
+
+out_stop:
+	/*
+	 * If this was a simple ftruncate() and the file will remain alive,
+	 * then we need to clear up the orphan record which we created above.
+	 * However, if this was a real unlink then we were called by
+	 * ext4_delete_inode(), and we allow that function to clean up the
+	 * orphan info for us.
+	 */
+	if (inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+
+	mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+	ext4_journal_stop(handle);
+}
+
+/*
+ * ext4_ext_writepage_trans_blocks:
+ * calculate max number of blocks we could modify
+ * in order to allocate new block for an inode
+ */
+int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
+{
+	int needed;
+
+	needed = ext4_ext_calc_credits_for_insert(inode, NULL);
+
+	/* caller wants to allocate num blocks, but note it includes sb */
+	needed = needed * num - (num - 1);
+
+#ifdef CONFIG_QUOTA
+	needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+#endif
+
+	return needed;
+}
+
+EXPORT_SYMBOL(ext4_mark_inode_dirty);
+EXPORT_SYMBOL(ext4_ext_invalidate_cache);
+EXPORT_SYMBOL(ext4_ext_insert_extent);
+EXPORT_SYMBOL(ext4_ext_walk_space);
+EXPORT_SYMBOL(ext4_ext_find_goal);
+EXPORT_SYMBOL(ext4_ext_calc_credits_for_insert);
+
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
new file mode 100644
index 000000000000..0b622c0624b7
--- /dev/null
+++ b/fs/ext4/file.c
@@ -0,0 +1,139 @@
+/*
+ *  linux/fs/ext4/file.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/file.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext4 fs regular file handling primitives
+ *
+ *  64-bit file support on 64-bit platforms by Jakub Jelinek
+ *	(jj@sunsite.ms.mff.cuni.cz)
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Called when an inode is released. Note that this is different
+ * from ext4_file_open: open gets called at every open, but release
+ * gets called only when /all/ the files are closed.
+ */
+static int ext4_release_file (struct inode * inode, struct file * filp)
+{
+	/* if we are the last writer on the inode, drop the block reservation */
+	if ((filp->f_mode & FMODE_WRITE) &&
+			(atomic_read(&inode->i_writecount) == 1))
+	{
+		mutex_lock(&EXT4_I(inode)->truncate_mutex);
+		ext4_discard_reservation(inode);
+		mutex_unlock(&EXT4_I(inode)->truncate_mutex);
+	}
+	if (is_dx(inode) && filp->private_data)
+		ext4_htree_free_dir_info(filp->private_data);
+
+	return 0;
+}
+
+static ssize_t
+ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_dentry->d_inode;
+	ssize_t ret;
+	int err;
+
+	ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+	/*
+	 * Skip flushing if there was an error, or if nothing was written.
+	 */
+	if (ret <= 0)
+		return ret;
+
+	/*
+	 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
+	 * journalling then we need to make sure that we force the transaction
+	 * to disk to keep all metadata uptodate synchronously.
+	 */
+	if (file->f_flags & O_SYNC) {
+		/*
+		 * If we are non-data-journaled, then the dirty data has
+		 * already been flushed to backing store by generic_osync_inode,
+		 * and the inode has been flushed too if there have been any
+		 * modifications other than mere timestamp updates.
+		 *
+		 * Open question --- do we care about flushing timestamps too
+		 * if the inode is IS_SYNC?
+		 */
+		if (!ext4_should_journal_data(inode))
+			return ret;
+
+		goto force_commit;
+	}
+
+	/*
+	 * So we know that there has been no forced data flush.  If the inode
+	 * is marked IS_SYNC, we need to force one ourselves.
+	 */
+	if (!IS_SYNC(inode))
+		return ret;
+
+	/*
+	 * Open question #2 --- should we force data to disk here too?  If we
+	 * don't, the only impact is that data=writeback filesystems won't
+	 * flush data to disk automatically on IS_SYNC, only metadata (but
+	 * historically, that is what ext2 has done.)
+	 */
+
+force_commit:
+	err = ext4_force_commit(inode->i_sb);
+	if (err)
+		return err;
+	return ret;
+}
+
+const struct file_operations ext4_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= ext4_file_write,
+	.ioctl		= ext4_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ext4_compat_ioctl,
+#endif
+	.mmap		= generic_file_mmap,
+	.open		= generic_file_open,
+	.release	= ext4_release_file,
+	.fsync		= ext4_sync_file,
+	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+};
+
+struct inode_operations ext4_file_inode_operations = {
+	.truncate	= ext4_truncate,
+	.setattr	= ext4_setattr,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.permission	= ext4_permission,
+};
+
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
new file mode 100644
index 000000000000..2a167d7131fa
--- /dev/null
+++ b/fs/ext4/fsync.c
@@ -0,0 +1,88 @@
+/*
+ *  linux/fs/ext4/fsync.c
+ *
+ *  Copyright (C) 1993  Stephen Tweedie (sct@redhat.com)
+ *  from
+ *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
+ *                      Laboratoire MASI - Institut Blaise Pascal
+ *                      Universite Pierre et Marie Curie (Paris VI)
+ *  from
+ *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext4fs fsync primitive
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ *  Removed unnecessary code duplication for little endian machines
+ *  and excessive __inline__s.
+ *        Andi Kleen, 1997
+ *
+ * Major simplications and cleanup - we only need to do the metadata, because
+ * we can depend on generic_block_fdatasync() to sync the data blocks.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+
+/*
+ * akpm: A new design for ext4_sync_file().
+ *
+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+ * There cannot be a transaction open by this task.
+ * Another task could have dirtied this inode.  Its data can be in any
+ * state in the journalling system.
+ *
+ * What we do is just kick off a commit and wait on it.  This will snapshot the
+ * inode to disk.
+ */
+
+int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int ret = 0;
+
+	J_ASSERT(ext4_journal_current_handle() == 0);
+
+	/*
+	 * data=writeback:
+	 *  The caller's filemap_fdatawrite()/wait will sync the data.
+	 *  sync_inode() will sync the metadata
+	 *
+	 * data=ordered:
+	 *  The caller's filemap_fdatawrite() will write the data and
+	 *  sync_inode() will write the inode if it is dirty.  Then the caller's
+	 *  filemap_fdatawait() will wait on the pages.
+	 *
+	 * data=journal:
+	 *  filemap_fdatawrite won't do anything (the buffers are clean).
+	 *  ext4_force_commit will write the file data into the journal and
+	 *  will wait on that.
+	 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
+	 *  (they were dirtied by commit).  But that's OK - the blocks are
+	 *  safe in-journal, which is all fsync() needs to ensure.
+	 */
+	if (ext4_should_journal_data(inode)) {
+		ret = ext4_force_commit(inode->i_sb);
+		goto out;
+	}
+
+	/*
+	 * The VFS has written the file data.  If the inode is unaltered
+	 * then we need not start a commit.
+	 */
+	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = 0, /* sys_fsync did this */
+		};
+		ret = sync_inode(inode, &wbc);
+	}
+out:
+	return ret;
+}
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
new file mode 100644
index 000000000000..a67966385e06
--- /dev/null
+++ b/fs/ext4/hash.c
@@ -0,0 +1,152 @@
+/*
+ *  linux/fs/ext4/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This file is released under the GPL v2.
+ *
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/sched.h>
+#include <linux/ext4_fs.h>
+#include <linux/cryptohash.h>
+
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(__u32 buf[4], __u32 const in[])
+{
+	__u32	sum = 0;
+	__u32	b0 = buf[0], b1 = buf[1];
+	__u32	a = in[0], b = in[1], c = in[2], d = in[3];
+	int	n = 16;
+
+	do {
+		sum += DELTA;
+		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+	} while(--n);
+
+	buf[0] += b0;
+	buf[1] += b1;
+}
+
+
+/* The old legacy hash */
+static __u32 dx_hack_hash (const char *name, int len)
+{
+	__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	while (len--) {
+		__u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+
+		if (hash & 0x80000000) hash -= 0x7fffffff;
+		hash1 = hash0;
+		hash0 = hash;
+	}
+	return (hash0 << 1);
+}
+
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i=0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = msg[i] + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+/*
+ * Returns the hash of a filename.  If len is 0 and name is NULL, then
+ * this function can be used to test whether or not a hash version is
+ * supported.
+ *
+ * The seed is an 4 longword (32 bits) "secret" which can be used to
+ * uniquify a hash.  If the seed is all zero's, then some default seed
+ * may be used.
+ *
+ * A particular hash version specifies whether or not the seed is
+ * represented, and whether or not the returned hash is 32 bits or 64
+ * bits.  32 bit hashes will return 0 for the minor hash.
+ */
+int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
+{
+	__u32	hash;
+	__u32	minor_hash = 0;
+	const char	*p;
+	int		i;
+	__u32		in[8], buf[4];
+
+	/* Initialize the default seed for the hash checksum functions */
+	buf[0] = 0x67452301;
+	buf[1] = 0xefcdab89;
+	buf[2] = 0x98badcfe;
+	buf[3] = 0x10325476;
+
+	/* Check to see if the seed is all zero's */
+	if (hinfo->seed) {
+		for (i=0; i < 4; i++) {
+			if (hinfo->seed[i])
+				break;
+		}
+		if (i < 4)
+			memcpy(buf, hinfo->seed, sizeof(buf));
+	}
+
+	switch (hinfo->hash_version) {
+	case DX_HASH_LEGACY:
+		hash = dx_hack_hash(name, len);
+		break;
+	case DX_HASH_HALF_MD4:
+		p = name;
+		while (len > 0) {
+			str2hashbuf(p, len, in, 8);
+			half_md4_transform(buf, in);
+			len -= 32;
+			p += 32;
+		}
+		minor_hash = buf[2];
+		hash = buf[1];
+		break;
+	case DX_HASH_TEA:
+		p = name;
+		while (len > 0) {
+			str2hashbuf(p, len, in, 4);
+			TEA_transform(buf, in);
+			len -= 16;
+			p += 16;
+		}
+		hash = buf[0];
+		minor_hash = buf[1];
+		break;
+	default:
+		hinfo->hash = 0;
+		return -1;
+	}
+	hash = hash & ~1;
+	if (hash == (EXT4_HTREE_EOF << 1))
+		hash = (EXT4_HTREE_EOF-1) << 1;
+	hinfo->hash = hash;
+	hinfo->minor_hash = minor_hash;
+	return 0;
+}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
new file mode 100644
index 000000000000..c88b439ba5cd
--- /dev/null
+++ b/fs/ext4/ialloc.c
@@ -0,0 +1,772 @@
+/*
+ *  linux/fs/ext4/ialloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  BSD ufs-inspired inode and directory allocation by
+ *  Stephen Tweedie (sct@redhat.com), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/random.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <asm/byteorder.h>
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * ialloc.c contains the inodes allocation and deallocation routines
+ */
+
+/*
+ * The free inodes are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.
+ */
+
+
+/*
+ * Read the inode allocation bitmap for a given block_group, reading
+ * into the specified slot in the superblock's bitmap cache.
+ *
+ * Return buffer_head of bitmap on success or NULL.
+ */
+static struct buffer_head *
+read_inode_bitmap(struct super_block * sb, unsigned long block_group)
+{
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh = NULL;
+
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		goto error_out;
+
+	bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
+	if (!bh)
+		ext4_error(sb, "read_inode_bitmap",
+			    "Cannot read inode bitmap - "
+			    "block_group = %lu, inode_bitmap = %llu",
+			    block_group, ext4_inode_bitmap(sb, desc));
+error_out:
+	return bh;
+}
+
+/*
+ * NOTE! When we get the inode, we're the only people
+ * that have access to it, and as such there are no
+ * race conditions we have to worry about. The inode
+ * is not on the hash-lists, and it cannot be reached
+ * through the filesystem because the directory entry
+ * has been deleted earlier.
+ *
+ * HOWEVER: we must make sure that we get no aliases,
+ * which means that we have to call "clear_inode()"
+ * _before_ we mark the inode not in use in the inode
+ * bitmaps. Otherwise a newly created file might use
+ * the same inode number (not actually the same pointer
+ * though), and then we'd have two inodes sharing the
+ * same inode number and space on the harddisk.
+ */
+void ext4_free_inode (handle_t *handle, struct inode * inode)
+{
+	struct super_block * sb = inode->i_sb;
+	int is_directory;
+	unsigned long ino;
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bh2;
+	unsigned long block_group;
+	unsigned long bit;
+	struct ext4_group_desc * gdp;
+	struct ext4_super_block * es;
+	struct ext4_sb_info *sbi;
+	int fatal = 0, err;
+
+	if (atomic_read(&inode->i_count) > 1) {
+		printk ("ext4_free_inode: inode has count=%d\n",
+					atomic_read(&inode->i_count));
+		return;
+	}
+	if (inode->i_nlink) {
+		printk ("ext4_free_inode: inode has nlink=%d\n",
+			inode->i_nlink);
+		return;
+	}
+	if (!sb) {
+		printk("ext4_free_inode: inode on nonexistent device\n");
+		return;
+	}
+	sbi = EXT4_SB(sb);
+
+	ino = inode->i_ino;
+	ext4_debug ("freeing inode %lu\n", ino);
+
+	/*
+	 * Note: we must free any quota before locking the superblock,
+	 * as writing the quota to disk may need the lock as well.
+	 */
+	DQUOT_INIT(inode);
+	ext4_xattr_delete_inode(handle, inode);
+	DQUOT_FREE_INODE(inode);
+	DQUOT_DROP(inode);
+
+	is_directory = S_ISDIR(inode->i_mode);
+
+	/* Do this BEFORE marking the inode not in use or returning an error */
+	clear_inode (inode);
+
+	es = EXT4_SB(sb)->s_es;
+	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+		ext4_error (sb, "ext4_free_inode",
+			    "reserved or nonexistent inode %lu", ino);
+		goto error_return;
+	}
+	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+	bitmap_bh = read_inode_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		goto error_return;
+
+	BUFFER_TRACE(bitmap_bh, "get_write_access");
+	fatal = ext4_journal_get_write_access(handle, bitmap_bh);
+	if (fatal)
+		goto error_return;
+
+	/* Ok, now we can actually update the inode bitmaps.. */
+	if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+					bit, bitmap_bh->b_data))
+		ext4_error (sb, "ext4_free_inode",
+			      "bit already cleared for inode %lu", ino);
+	else {
+		gdp = ext4_get_group_desc (sb, block_group, &bh2);
+
+		BUFFER_TRACE(bh2, "get_write_access");
+		fatal = ext4_journal_get_write_access(handle, bh2);
+		if (fatal) goto error_return;
+
+		if (gdp) {
+			spin_lock(sb_bgl_lock(sbi, block_group));
+			gdp->bg_free_inodes_count = cpu_to_le16(
+				le16_to_cpu(gdp->bg_free_inodes_count) + 1);
+			if (is_directory)
+				gdp->bg_used_dirs_count = cpu_to_le16(
+				  le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+			spin_unlock(sb_bgl_lock(sbi, block_group));
+			percpu_counter_inc(&sbi->s_freeinodes_counter);
+			if (is_directory)
+				percpu_counter_dec(&sbi->s_dirs_counter);
+
+		}
+		BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
+		err = ext4_journal_dirty_metadata(handle, bh2);
+		if (!fatal) fatal = err;
+	}
+	BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
+	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+	if (!fatal)
+		fatal = err;
+	sb->s_dirt = 1;
+error_return:
+	brelse(bitmap_bh);
+	ext4_std_error(sb, fatal);
+}
+
+/*
+ * There are two policies for allocating an inode.  If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory\'s block
+ * group to find a free inode.
+ */
+static int find_group_dir(struct super_block *sb, struct inode *parent)
+{
+	int ngroups = EXT4_SB(sb)->s_groups_count;
+	unsigned int freei, avefreei;
+	struct ext4_group_desc *desc, *best_desc = NULL;
+	struct buffer_head *bh;
+	int group, best_group = -1;
+
+	freei = percpu_counter_read_positive(&EXT4_SB(sb)->s_freeinodes_counter);
+	avefreei = freei / ngroups;
+
+	for (group = 0; group < ngroups; group++) {
+		desc = ext4_get_group_desc (sb, group, &bh);
+		if (!desc || !desc->bg_free_inodes_count)
+			continue;
+		if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+			continue;
+		if (!best_desc ||
+		    (le16_to_cpu(desc->bg_free_blocks_count) >
+		     le16_to_cpu(best_desc->bg_free_blocks_count))) {
+			best_group = group;
+			best_desc = desc;
+		}
+	}
+	return best_group;
+}
+
+/*
+ * Orlov's allocator for directories.
+ *
+ * We always try to spread first-level directories.
+ *
+ * If there are blockgroups with both free inodes and free blocks counts
+ * not worse than average we return one with smallest directory count.
+ * Otherwise we simply return a random group.
+ *
+ * For the rest rules look so:
+ *
+ * It's OK to put directory into a group unless
+ * it has too many directories already (max_dirs) or
+ * it has too few free inodes left (min_inodes) or
+ * it has too few free blocks left (min_blocks) or
+ * it's already running too large debt (max_debt).
+ * Parent's group is prefered, if it doesn't satisfy these
+ * conditions we search cyclically through the rest. If none
+ * of the groups look good we just look for a group with more
+ * free inodes than average (starting at parent's group).
+ *
+ * Debt is incremented each time we allocate a directory and decremented
+ * when we allocate an inode, within 0--255.
+ */
+
+#define INODE_COST 64
+#define BLOCK_COST 256
+
+static int find_group_orlov(struct super_block *sb, struct inode *parent)
+{
+	int parent_group = EXT4_I(parent)->i_block_group;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int ngroups = sbi->s_groups_count;
+	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
+	unsigned int freei, avefreei;
+	ext4_fsblk_t freeb, avefreeb;
+	ext4_fsblk_t blocks_per_dir;
+	unsigned int ndirs;
+	int max_debt, max_dirs, min_inodes;
+	ext4_grpblk_t min_blocks;
+	int group = -1, i;
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh;
+
+	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
+	avefreei = freei / ngroups;
+	freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	avefreeb = freeb;
+	do_div(avefreeb, ngroups);
+	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
+
+	if ((parent == sb->s_root->d_inode) ||
+	    (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+		int best_ndir = inodes_per_group;
+		int best_group = -1;
+
+		get_random_bytes(&group, sizeof(group));
+		parent_group = (unsigned)group % ngroups;
+		for (i = 0; i < ngroups; i++) {
+			group = (parent_group + i) % ngroups;
+			desc = ext4_get_group_desc (sb, group, &bh);
+			if (!desc || !desc->bg_free_inodes_count)
+				continue;
+			if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+				continue;
+			if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+				continue;
+			if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+				continue;
+			best_group = group;
+			best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+		}
+		if (best_group >= 0)
+			return best_group;
+		goto fallback;
+	}
+
+	blocks_per_dir = ext4_blocks_count(es) - freeb;
+	do_div(blocks_per_dir, ndirs);
+
+	max_dirs = ndirs / ngroups + inodes_per_group / 16;
+	min_inodes = avefreei - inodes_per_group / 4;
+	min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
+
+	max_debt = EXT4_BLOCKS_PER_GROUP(sb);
+	max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
+	if (max_debt * INODE_COST > inodes_per_group)
+		max_debt = inodes_per_group / INODE_COST;
+	if (max_debt > 255)
+		max_debt = 255;
+	if (max_debt == 0)
+		max_debt = 1;
+
+	for (i = 0; i < ngroups; i++) {
+		group = (parent_group + i) % ngroups;
+		desc = ext4_get_group_desc (sb, group, &bh);
+		if (!desc || !desc->bg_free_inodes_count)
+			continue;
+		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+			continue;
+		if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+			continue;
+		if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+			continue;
+		return group;
+	}
+
+fallback:
+	for (i = 0; i < ngroups; i++) {
+		group = (parent_group + i) % ngroups;
+		desc = ext4_get_group_desc (sb, group, &bh);
+		if (!desc || !desc->bg_free_inodes_count)
+			continue;
+		if (le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+			return group;
+	}
+
+	if (avefreei) {
+		/*
+		 * The free-inodes counter is approximate, and for really small
+		 * filesystems the above test can fail to find any blockgroups
+		 */
+		avefreei = 0;
+		goto fallback;
+	}
+
+	return -1;
+}
+
+static int find_group_other(struct super_block *sb, struct inode *parent)
+{
+	int parent_group = EXT4_I(parent)->i_block_group;
+	int ngroups = EXT4_SB(sb)->s_groups_count;
+	struct ext4_group_desc *desc;
+	struct buffer_head *bh;
+	int group, i;
+
+	/*
+	 * Try to place the inode in its parent directory
+	 */
+	group = parent_group;
+	desc = ext4_get_group_desc (sb, group, &bh);
+	if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+			le16_to_cpu(desc->bg_free_blocks_count))
+		return group;
+
+	/*
+	 * We're going to place this inode in a different blockgroup from its
+	 * parent.  We want to cause files in a common directory to all land in
+	 * the same blockgroup.  But we want files which are in a different
+	 * directory which shares a blockgroup with our parent to land in a
+	 * different blockgroup.
+	 *
+	 * So add our directory's i_ino into the starting point for the hash.
+	 */
+	group = (group + parent->i_ino) % ngroups;
+
+	/*
+	 * Use a quadratic hash to find a group with a free inode and some free
+	 * blocks.
+	 */
+	for (i = 1; i < ngroups; i <<= 1) {
+		group += i;
+		if (group >= ngroups)
+			group -= ngroups;
+		desc = ext4_get_group_desc (sb, group, &bh);
+		if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
+				le16_to_cpu(desc->bg_free_blocks_count))
+			return group;
+	}
+
+	/*
+	 * That failed: try linear search for a free inode, even if that group
+	 * has no free blocks.
+	 */
+	group = parent_group;
+	for (i = 0; i < ngroups; i++) {
+		if (++group >= ngroups)
+			group = 0;
+		desc = ext4_get_group_desc (sb, group, &bh);
+		if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+			return group;
+	}
+
+	return -1;
+}
+
+/*
+ * There are two policies for allocating an inode.  If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */
+struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
+{
+	struct super_block *sb;
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *bh2;
+	int group;
+	unsigned long ino = 0;
+	struct inode * inode;
+	struct ext4_group_desc * gdp = NULL;
+	struct ext4_super_block * es;
+	struct ext4_inode_info *ei;
+	struct ext4_sb_info *sbi;
+	int err = 0;
+	struct inode *ret;
+	int i;
+
+	/* Cannot create files in a deleted directory */
+	if (!dir || !dir->i_nlink)
+		return ERR_PTR(-EPERM);
+
+	sb = dir->i_sb;
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	ei = EXT4_I(inode);
+
+	sbi = EXT4_SB(sb);
+	es = sbi->s_es;
+	if (S_ISDIR(mode)) {
+		if (test_opt (sb, OLDALLOC))
+			group = find_group_dir(sb, dir);
+		else
+			group = find_group_orlov(sb, dir);
+	} else
+		group = find_group_other(sb, dir);
+
+	err = -ENOSPC;
+	if (group == -1)
+		goto out;
+
+	for (i = 0; i < sbi->s_groups_count; i++) {
+		err = -EIO;
+
+		gdp = ext4_get_group_desc(sb, group, &bh2);
+		if (!gdp)
+			goto fail;
+
+		brelse(bitmap_bh);
+		bitmap_bh = read_inode_bitmap(sb, group);
+		if (!bitmap_bh)
+			goto fail;
+
+		ino = 0;
+
+repeat_in_this_group:
+		ino = ext4_find_next_zero_bit((unsigned long *)
+				bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
+		if (ino < EXT4_INODES_PER_GROUP(sb)) {
+
+			BUFFER_TRACE(bitmap_bh, "get_write_access");
+			err = ext4_journal_get_write_access(handle, bitmap_bh);
+			if (err)
+				goto fail;
+
+			if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
+						ino, bitmap_bh->b_data)) {
+				/* we won it */
+				BUFFER_TRACE(bitmap_bh,
+					"call ext4_journal_dirty_metadata");
+				err = ext4_journal_dirty_metadata(handle,
+								bitmap_bh);
+				if (err)
+					goto fail;
+				goto got;
+			}
+			/* we lost it */
+			jbd2_journal_release_buffer(handle, bitmap_bh);
+
+			if (++ino < EXT4_INODES_PER_GROUP(sb))
+				goto repeat_in_this_group;
+		}
+
+		/*
+		 * This case is possible in concurrent environment.  It is very
+		 * rare.  We cannot repeat the find_group_xxx() call because
+		 * that will simply return the same blockgroup, because the
+		 * group descriptor metadata has not yet been updated.
+		 * So we just go onto the next blockgroup.
+		 */
+		if (++group == sbi->s_groups_count)
+			group = 0;
+	}
+	err = -ENOSPC;
+	goto out;
+
+got:
+	ino += group * EXT4_INODES_PER_GROUP(sb) + 1;
+	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+		ext4_error (sb, "ext4_new_inode",
+			    "reserved inode or inode > inodes count - "
+			    "block_group = %d, inode=%lu", group, ino);
+		err = -EIO;
+		goto fail;
+	}
+
+	BUFFER_TRACE(bh2, "get_write_access");
+	err = ext4_journal_get_write_access(handle, bh2);
+	if (err) goto fail;
+	spin_lock(sb_bgl_lock(sbi, group));
+	gdp->bg_free_inodes_count =
+		cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
+	if (S_ISDIR(mode)) {
+		gdp->bg_used_dirs_count =
+			cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
+	}
+	spin_unlock(sb_bgl_lock(sbi, group));
+	BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
+	err = ext4_journal_dirty_metadata(handle, bh2);
+	if (err) goto fail;
+
+	percpu_counter_dec(&sbi->s_freeinodes_counter);
+	if (S_ISDIR(mode))
+		percpu_counter_inc(&sbi->s_dirs_counter);
+	sb->s_dirt = 1;
+
+	inode->i_uid = current->fsuid;
+	if (test_opt (sb, GRPID))
+		inode->i_gid = dir->i_gid;
+	else if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current->fsgid;
+	inode->i_mode = mode;
+
+	inode->i_ino = ino;
+	/* This is the optimal IO size (for stat), not the fs block size */
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+
+	memset(ei->i_data, 0, sizeof(ei->i_data));
+	ei->i_dir_start_lookup = 0;
+	ei->i_disksize = 0;
+
+	ei->i_flags = EXT4_I(dir)->i_flags & ~EXT4_INDEX_FL;
+	if (S_ISLNK(mode))
+		ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
+	/* dirsync only applies to directories */
+	if (!S_ISDIR(mode))
+		ei->i_flags &= ~EXT4_DIRSYNC_FL;
+#ifdef EXT4_FRAGMENTS
+	ei->i_faddr = 0;
+	ei->i_frag_no = 0;
+	ei->i_frag_size = 0;
+#endif
+	ei->i_file_acl = 0;
+	ei->i_dir_acl = 0;
+	ei->i_dtime = 0;
+	ei->i_block_alloc_info = NULL;
+	ei->i_block_group = group;
+
+	ext4_set_inode_flags(inode);
+	if (IS_DIRSYNC(inode))
+		handle->h_sync = 1;
+	insert_inode_hash(inode);
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+
+	ei->i_state = EXT4_STATE_NEW;
+	ei->i_extra_isize =
+		(EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) ?
+		sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE : 0;
+
+	ret = inode;
+	if(DQUOT_ALLOC_INODE(inode)) {
+		err = -EDQUOT;
+		goto fail_drop;
+	}
+
+	err = ext4_init_acl(handle, inode, dir);
+	if (err)
+		goto fail_free_drop;
+
+	err = ext4_init_security(handle,inode, dir);
+	if (err)
+		goto fail_free_drop;
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	if (err) {
+		ext4_std_error(sb, err);
+		goto fail_free_drop;
+	}
+	if (test_opt(sb, EXTENTS)) {
+		EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+		ext4_ext_tree_init(handle, inode);
+		if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+			err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+			if (err) goto fail;
+			EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS);
+			BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
+			err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+		}
+	}
+
+	ext4_debug("allocating inode %lu\n", inode->i_ino);
+	goto really_out;
+fail:
+	ext4_std_error(sb, err);
+out:
+	iput(inode);
+	ret = ERR_PTR(err);
+really_out:
+	brelse(bitmap_bh);
+	return ret;
+
+fail_free_drop:
+	DQUOT_FREE_INODE(inode);
+
+fail_drop:
+	DQUOT_DROP(inode);
+	inode->i_flags |= S_NOQUOTA;
+	inode->i_nlink = 0;
+	iput(inode);
+	brelse(bitmap_bh);
+	return ERR_PTR(err);
+}
+
+/* Verify that we are loading a valid orphan from disk */
+struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
+{
+	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
+	unsigned long block_group;
+	int bit;
+	struct buffer_head *bitmap_bh = NULL;
+	struct inode *inode = NULL;
+
+	/* Error cases - e2fsck has already cleaned up for us */
+	if (ino > max_ino) {
+		ext4_warning(sb, __FUNCTION__,
+			     "bad orphan ino %lu!  e2fsck was run?", ino);
+		goto out;
+	}
+
+	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
+	bitmap_bh = read_inode_bitmap(sb, block_group);
+	if (!bitmap_bh) {
+		ext4_warning(sb, __FUNCTION__,
+			     "inode bitmap error for orphan %lu", ino);
+		goto out;
+	}
+
+	/* Having the inode bit set should be a 100% indicator that this
+	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
+	 * inodes that were being truncated, so we can't check i_nlink==0.
+	 */
+	if (!ext4_test_bit(bit, bitmap_bh->b_data) ||
+			!(inode = iget(sb, ino)) || is_bad_inode(inode) ||
+			NEXT_ORPHAN(inode) > max_ino) {
+		ext4_warning(sb, __FUNCTION__,
+			     "bad orphan inode %lu!  e2fsck was run?", ino);
+		printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
+		       bit, (unsigned long long)bitmap_bh->b_blocknr,
+		       ext4_test_bit(bit, bitmap_bh->b_data));
+		printk(KERN_NOTICE "inode=%p\n", inode);
+		if (inode) {
+			printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+			       is_bad_inode(inode));
+			printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+			       NEXT_ORPHAN(inode));
+			printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+		}
+		/* Avoid freeing blocks if we got a bad deleted inode */
+		if (inode && inode->i_nlink == 0)
+			inode->i_blocks = 0;
+		iput(inode);
+		inode = NULL;
+	}
+out:
+	brelse(bitmap_bh);
+	return inode;
+}
+
+unsigned long ext4_count_free_inodes (struct super_block * sb)
+{
+	unsigned long desc_count;
+	struct ext4_group_desc *gdp;
+	int i;
+#ifdef EXT4FS_DEBUG
+	struct ext4_super_block *es;
+	unsigned long bitmap_count, x;
+	struct buffer_head *bitmap_bh = NULL;
+
+	es = EXT4_SB(sb)->s_es;
+	desc_count = 0;
+	bitmap_count = 0;
+	gdp = NULL;
+	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+		gdp = ext4_get_group_desc (sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+		brelse(bitmap_bh);
+		bitmap_bh = read_inode_bitmap(sb, i);
+		if (!bitmap_bh)
+			continue;
+
+		x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
+		printk("group %d: stored = %d, counted = %lu\n",
+			i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+		bitmap_count += x;
+	}
+	brelse(bitmap_bh);
+	printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
+		le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+	return desc_count;
+#else
+	desc_count = 0;
+	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+		gdp = ext4_get_group_desc (sb, i, NULL);
+		if (!gdp)
+			continue;
+		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+		cond_resched();
+	}
+	return desc_count;
+#endif
+}
+
+/* Called at mount-time, super-block is locked */
+unsigned long ext4_count_dirs (struct super_block * sb)
+{
+	unsigned long count = 0;
+	int i;
+
+	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+		struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
+		if (!gdp)
+			continue;
+		count += le16_to_cpu(gdp->bg_used_dirs_count);
+	}
+	return count;
+}
+
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
new file mode 100644
index 000000000000..0a60ec5a16db
--- /dev/null
+++ b/fs/ext4/inode.c
@@ -0,0 +1,3233 @@
+/*
+ *  linux/fs/ext4/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Goal-directed block allocation by Stephen Tweedie
+ *	(sct@redhat.com), 1993, 1998
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *  64-bit file support on 64-bit platforms by Jakub Jelinek
+ *	(jj@sunsite.ms.mff.cuni.cz)
+ *
+ *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/jbd2.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/mpage.h>
+#include <linux/uio.h>
+#include <linux/bio.h>
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static int ext4_inode_is_fast_symlink(struct inode *inode)
+{
+	int ea_blocks = EXT4_I(inode)->i_file_acl ?
+		(inode->i_sb->s_blocksize >> 9) : 0;
+
+	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
+}
+
+/*
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled.  Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ */
+int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
+			struct buffer_head *bh, ext4_fsblk_t blocknr)
+{
+	int err;
+
+	might_sleep();
+
+	BUFFER_TRACE(bh, "enter");
+
+	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+		  "data mode %lx\n",
+		  bh, is_metadata, inode->i_mode,
+		  test_opt(inode->i_sb, DATA_FLAGS));
+
+	/* Never use the revoke function if we are doing full data
+	 * journaling: there is no need to, and a V1 superblock won't
+	 * support it.  Otherwise, only skip the revoke on un-journaled
+	 * data blocks. */
+
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+	    (!is_metadata && !ext4_should_journal_data(inode))) {
+		if (bh) {
+			BUFFER_TRACE(bh, "call jbd2_journal_forget");
+			return ext4_journal_forget(handle, bh);
+		}
+		return 0;
+	}
+
+	/*
+	 * data!=journal && (is_metadata || should_journal_data(inode))
+	 */
+	BUFFER_TRACE(bh, "call ext4_journal_revoke");
+	err = ext4_journal_revoke(handle, blocknr, bh);
+	if (err)
+		ext4_abort(inode->i_sb, __FUNCTION__,
+			   "error %d when attempting revoke", err);
+	BUFFER_TRACE(bh, "exit");
+	return err;
+}
+
+/*
+ * Work out how many blocks we need to proceed with the next chunk of a
+ * truncate transaction.
+ */
+static unsigned long blocks_for_truncate(struct inode *inode)
+{
+	unsigned long needed;
+
+	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
+
+	/* Give ourselves just enough room to cope with inodes in which
+	 * i_blocks is corrupt: we've seen disk corruptions in the past
+	 * which resulted in random data in an inode which looked enough
+	 * like a regular file for ext4 to try to delete it.  Things
+	 * will go a bit crazy if that happens, but at least we should
+	 * try not to panic the whole kernel. */
+	if (needed < 2)
+		needed = 2;
+
+	/* But we need to bound the transaction so we don't overflow the
+	 * journal. */
+	if (needed > EXT4_MAX_TRANS_DATA)
+		needed = EXT4_MAX_TRANS_DATA;
+
+	return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
+}
+
+/*
+ * Truncate transactions can be complex and absolutely huge.  So we need to
+ * be able to restart the transaction at a conventient checkpoint to make
+ * sure we don't overflow the journal.
+ *
+ * start_transaction gets us a new handle for a truncate transaction,
+ * and extend_transaction tries to extend the existing one a bit.  If
+ * extend fails, we need to propagate the failure up and restart the
+ * transaction in the top-level truncate loop. --sct
+ */
+static handle_t *start_transaction(struct inode *inode)
+{
+	handle_t *result;
+
+	result = ext4_journal_start(inode, blocks_for_truncate(inode));
+	if (!IS_ERR(result))
+		return result;
+
+	ext4_std_error(inode->i_sb, PTR_ERR(result));
+	return result;
+}
+
+/*
+ * Try to extend this transaction for the purposes of truncation.
+ *
+ * Returns 0 if we managed to create more room.  If we can't create more
+ * room, and the transaction must be restarted we return 1.
+ */
+static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+{
+	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+		return 0;
+	if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
+		return 0;
+	return 1;
+}
+
+/*
+ * Restart the transaction associated with *handle.  This does a commit,
+ * so before we call here everything must be consistently dirtied against
+ * this transaction.
+ */
+static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
+{
+	jbd_debug(2, "restarting handle %p\n", handle);
+	return ext4_journal_restart(handle, blocks_for_truncate(inode));
+}
+
+/*
+ * Called at the last iput() if i_nlink is zero.
+ */
+void ext4_delete_inode (struct inode * inode)
+{
+	handle_t *handle;
+
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (is_bad_inode(inode))
+		goto no_delete;
+
+	handle = start_transaction(inode);
+	if (IS_ERR(handle)) {
+		/*
+		 * If we're going to skip the normal cleanup, we still need to
+		 * make sure that the in-core orphan linked list is properly
+		 * cleaned up.
+		 */
+		ext4_orphan_del(NULL, inode);
+		goto no_delete;
+	}
+
+	if (IS_SYNC(inode))
+		handle->h_sync = 1;
+	inode->i_size = 0;
+	if (inode->i_blocks)
+		ext4_truncate(inode);
+	/*
+	 * Kill off the orphan record which ext4_truncate created.
+	 * AKPM: I think this can be inside the above `if'.
+	 * Note that ext4_orphan_del() has to be able to cope with the
+	 * deletion of a non-existent orphan - this is because we don't
+	 * know if ext4_truncate() actually created an orphan record.
+	 * (Well, we could do this if we need to, but heck - it works)
+	 */
+	ext4_orphan_del(handle, inode);
+	EXT4_I(inode)->i_dtime	= get_seconds();
+
+	/*
+	 * One subtle ordering requirement: if anything has gone wrong
+	 * (transaction abort, IO errors, whatever), then we can still
+	 * do these next steps (the fs will already have been marked as
+	 * having errors), but we can't free the inode if the mark_dirty
+	 * fails.
+	 */
+	if (ext4_mark_inode_dirty(handle, inode))
+		/* If that failed, just do the required in-core inode clear. */
+		clear_inode(inode);
+	else
+		ext4_free_inode(handle, inode);
+	ext4_journal_stop(handle);
+	return;
+no_delete:
+	clear_inode(inode);	/* We must guarantee clearing of inode... */
+}
+
+typedef struct {
+	__le32	*p;
+	__le32	key;
+	struct buffer_head *bh;
+} Indirect;
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+{
+	p->key = *(p->p = v);
+	p->bh = bh;
+}
+
+static int verify_chain(Indirect *from, Indirect *to)
+{
+	while (from <= to && from->key == *from->p)
+		from++;
+	return (from > to);
+}
+
+/**
+ *	ext4_block_to_path - parse the block number into array of offsets
+ *	@inode: inode in question (we are only interested in its superblock)
+ *	@i_block: block number to be parsed
+ *	@offsets: array to store the offsets in
+ *      @boundary: set this non-zero if the referred-to block is likely to be
+ *             followed (on disk) by an indirect block.
+ *
+ *	To store the locations of file's data ext4 uses a data structure common
+ *	for UNIX filesystems - tree of pointers anchored in the inode, with
+ *	data blocks at leaves and indirect blocks in intermediate nodes.
+ *	This function translates the block number into path in that tree -
+ *	return value is the path length and @offsets[n] is the offset of
+ *	pointer to (n+1)th node in the nth one. If @block is out of range
+ *	(negative or too large) warning is printed and zero returned.
+ *
+ *	Note: function doesn't find node addresses, so no IO is needed. All
+ *	we need to know is the capacity of indirect blocks (taken from the
+ *	inode->i_sb).
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
+
+static int ext4_block_to_path(struct inode *inode,
+			long i_block, int offsets[4], int *boundary)
+{
+	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
+	const long direct_blocks = EXT4_NDIR_BLOCKS,
+		indirect_blocks = ptrs,
+		double_blocks = (1 << (ptrs_bits * 2));
+	int n = 0;
+	int final = 0;
+
+	if (i_block < 0) {
+		ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
+	} else if (i_block < direct_blocks) {
+		offsets[n++] = i_block;
+		final = direct_blocks;
+	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
+		offsets[n++] = EXT4_IND_BLOCK;
+		offsets[n++] = i_block;
+		final = ptrs;
+	} else if ((i_block -= indirect_blocks) < double_blocks) {
+		offsets[n++] = EXT4_DIND_BLOCK;
+		offsets[n++] = i_block >> ptrs_bits;
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+		offsets[n++] = EXT4_TIND_BLOCK;
+		offsets[n++] = i_block >> (ptrs_bits * 2);
+		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else {
+		ext4_warning(inode->i_sb, "ext4_block_to_path", "block > big");
+	}
+	if (boundary)
+		*boundary = final - 1 - (i_block & (ptrs - 1));
+	return n;
+}
+
+/**
+ *	ext4_get_branch - read the chain of indirect blocks leading to data
+ *	@inode: inode in question
+ *	@depth: depth of the chain (1 - direct pointer, etc.)
+ *	@offsets: offsets of pointers in inode/indirect blocks
+ *	@chain: place to store the result
+ *	@err: here we store the error value
+ *
+ *	Function fills the array of triples <key, p, bh> and returns %NULL
+ *	if everything went OK or the pointer to the last filled triple
+ *	(incomplete one) otherwise. Upon the return chain[i].key contains
+ *	the number of (i+1)-th block in the chain (as it is stored in memory,
+ *	i.e. little-endian 32-bit), chain[i].p contains the address of that
+ *	number (it points into struct inode for i==0 and into the bh->b_data
+ *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+ *	block for i>0 and NULL for i==0. In other words, it holds the block
+ *	numbers of the chain, addresses they were taken from (and where we can
+ *	verify that chain did not change) and buffer_heads hosting these
+ *	numbers.
+ *
+ *	Function stops when it stumbles upon zero pointer (absent block)
+ *		(pointer to last triple returned, *@err == 0)
+ *	or when it gets an IO error reading an indirect block
+ *		(ditto, *@err == -EIO)
+ *	or when it notices that chain had been changed while it was reading
+ *		(ditto, *@err == -EAGAIN)
+ *	or when it reads all @depth-1 indirect blocks successfully and finds
+ *	the whole chain, all way to the data (returns %NULL, *err == 0).
+ */
+static Indirect *ext4_get_branch(struct inode *inode, int depth, int *offsets,
+				 Indirect chain[4], int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	Indirect *p = chain;
+	struct buffer_head *bh;
+
+	*err = 0;
+	/* i_data is not going away, no lock needed */
+	add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
+	if (!p->key)
+		goto no_block;
+	while (--depth) {
+		bh = sb_bread(sb, le32_to_cpu(p->key));
+		if (!bh)
+			goto failure;
+		/* Reader: pointers */
+		if (!verify_chain(chain, p))
+			goto changed;
+		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
+		/* Reader: end */
+		if (!p->key)
+			goto no_block;
+	}
+	return NULL;
+
+changed:
+	brelse(bh);
+	*err = -EAGAIN;
+	goto no_block;
+failure:
+	*err = -EIO;
+no_block:
+	return p;
+}
+
+/**
+ *	ext4_find_near - find a place for allocation with sufficient locality
+ *	@inode: owner
+ *	@ind: descriptor of indirect block.
+ *
+ *	This function returns the prefered place for block allocation.
+ *	It is used when heuristic for sequential allocation fails.
+ *	Rules are:
+ *	  + if there is a block to the left of our position - allocate near it.
+ *	  + if pointer will live in indirect block - allocate near that block.
+ *	  + if pointer will live in inode - allocate in the same
+ *	    cylinder group.
+ *
+ * In the latter case we colour the starting block by the callers PID to
+ * prevent it from clashing with concurrent allocations for a different inode
+ * in the same block group.   The PID is used here so that functionally related
+ * files will be close-by on-disk.
+ *
+ *	Caller must make sure that @ind is valid and will stay that way.
+ */
+static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
+	__le32 *p;
+	ext4_fsblk_t bg_start;
+	ext4_grpblk_t colour;
+
+	/* Try to find previous block */
+	for (p = ind->p - 1; p >= start; p--) {
+		if (*p)
+			return le32_to_cpu(*p);
+	}
+
+	/* No such thing, so let's try location of indirect block */
+	if (ind->bh)
+		return ind->bh->b_blocknr;
+
+	/*
+	 * It is going to be referred to from the inode itself? OK, just put it
+	 * into the same cylinder group then.
+	 */
+	bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+	colour = (current->pid % 16) *
+			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	return bg_start + colour;
+}
+
+/**
+ *	ext4_find_goal - find a prefered place for allocation.
+ *	@inode: owner
+ *	@block:  block we want
+ *	@chain:  chain of indirect blocks
+ *	@partial: pointer to the last triple within a chain
+ *	@goal:	place to store the result.
+ *
+ *	Normally this function find the prefered place for block allocation,
+ *	stores it in *@goal and returns zero.
+ */
+
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, long block,
+		Indirect chain[4], Indirect *partial)
+{
+	struct ext4_block_alloc_info *block_i;
+
+	block_i =  EXT4_I(inode)->i_block_alloc_info;
+
+	/*
+	 * try the heuristic for sequential allocation,
+	 * failing that at least try to get decent locality.
+	 */
+	if (block_i && (block == block_i->last_alloc_logical_block + 1)
+		&& (block_i->last_alloc_physical_block != 0)) {
+		return block_i->last_alloc_physical_block + 1;
+	}
+
+	return ext4_find_near(inode, partial);
+}
+
+/**
+ *	ext4_blks_to_allocate: Look up the block map and count the number
+ *	of direct blocks need to be allocated for the given branch.
+ *
+ *	@branch: chain of indirect blocks
+ *	@k: number of blocks need for indirect blocks
+ *	@blks: number of data blocks to be mapped.
+ *	@blocks_to_boundary:  the offset in the indirect block
+ *
+ *	return the total number of blocks to be allocate, including the
+ *	direct and indirect blocks.
+ */
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
+		int blocks_to_boundary)
+{
+	unsigned long count = 0;
+
+	/*
+	 * Simple case, [t,d]Indirect block(s) has not allocated yet
+	 * then it's clear blocks on that path have not allocated
+	 */
+	if (k > 0) {
+		/* right now we don't handle cross boundary allocation */
+		if (blks < blocks_to_boundary + 1)
+			count += blks;
+		else
+			count += blocks_to_boundary + 1;
+		return count;
+	}
+
+	count++;
+	while (count < blks && count <= blocks_to_boundary &&
+		le32_to_cpu(*(branch[0].p + count)) == 0) {
+		count++;
+	}
+	return count;
+}
+
+/**
+ *	ext4_alloc_blocks: multiple allocate blocks needed for a branch
+ *	@indirect_blks: the number of blocks need to allocate for indirect
+ *			blocks
+ *
+ *	@new_blocks: on return it will store the new block numbers for
+ *	the indirect blocks(if needed) and the first direct block,
+ *	@blks:	on return it will store the total number of allocated
+ *		direct blocks
+ */
+static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t goal, int indirect_blks, int blks,
+			ext4_fsblk_t new_blocks[4], int *err)
+{
+	int target, i;
+	unsigned long count = 0;
+	int index = 0;
+	ext4_fsblk_t current_block = 0;
+	int ret = 0;
+
+	/*
+	 * Here we try to allocate the requested multiple blocks at once,
+	 * on a best-effort basis.
+	 * To build a branch, we should allocate blocks for
+	 * the indirect blocks(if not allocated yet), and at least
+	 * the first direct block of this branch.  That's the
+	 * minimum number of blocks need to allocate(required)
+	 */
+	target = blks + indirect_blks;
+
+	while (1) {
+		count = target;
+		/* allocating blocks for indirect blocks and direct blocks */
+		current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+		if (*err)
+			goto failed_out;
+
+		target -= count;
+		/* allocate blocks for indirect blocks */
+		while (index < indirect_blks && count) {
+			new_blocks[index++] = current_block++;
+			count--;
+		}
+
+		if (count > 0)
+			break;
+	}
+
+	/* save the new block number for the first direct block */
+	new_blocks[index] = current_block;
+
+	/* total number of blocks allocated for direct blocks */
+	ret = count;
+	*err = 0;
+	return ret;
+failed_out:
+	for (i = 0; i <index; i++)
+		ext4_free_blocks(handle, inode, new_blocks[i], 1);
+	return ret;
+}
+
+/**
+ *	ext4_alloc_branch - allocate and set up a chain of blocks.
+ *	@inode: owner
+ *	@indirect_blks: number of allocated indirect blocks
+ *	@blks: number of allocated direct blocks
+ *	@offsets: offsets (in the blocks) to store the pointers to next.
+ *	@branch: place to store the chain in.
+ *
+ *	This function allocates blocks, zeroes out all but the last one,
+ *	links them into chain and (if we are synchronous) writes them to disk.
+ *	In other words, it prepares a branch that can be spliced onto the
+ *	inode. It stores the information about that chain in the branch[], in
+ *	the same format as ext4_get_branch() would do. We are calling it after
+ *	we had read the existing part of chain and partial points to the last
+ *	triple of that (one with zero ->key). Upon the exit we have the same
+ *	picture as after the successful ext4_get_block(), except that in one
+ *	place chain is disconnected - *branch->p is still zero (we did not
+ *	set the last link), but branch->key contains the number that should
+ *	be placed into *branch->p to fill that gap.
+ *
+ *	If allocation fails we free all blocks we've allocated (and forget
+ *	their buffer_heads) and return the error value the from failed
+ *	ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ *	as described above and return 0.
+ */
+static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
+			int indirect_blks, int *blks, ext4_fsblk_t goal,
+			int *offsets, Indirect *branch)
+{
+	int blocksize = inode->i_sb->s_blocksize;
+	int i, n = 0;
+	int err = 0;
+	struct buffer_head *bh;
+	int num;
+	ext4_fsblk_t new_blocks[4];
+	ext4_fsblk_t current_block;
+
+	num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+				*blks, new_blocks, &err);
+	if (err)
+		return err;
+
+	branch[0].key = cpu_to_le32(new_blocks[0]);
+	/*
+	 * metadata blocks and data blocks are allocated.
+	 */
+	for (n = 1; n <= indirect_blks;  n++) {
+		/*
+		 * Get buffer_head for parent block, zero it out
+		 * and set the pointer to new one, then send
+		 * parent to disk.
+		 */
+		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+		branch[n].bh = bh;
+		lock_buffer(bh);
+		BUFFER_TRACE(bh, "call get_create_access");
+		err = ext4_journal_get_create_access(handle, bh);
+		if (err) {
+			unlock_buffer(bh);
+			brelse(bh);
+			goto failed;
+		}
+
+		memset(bh->b_data, 0, blocksize);
+		branch[n].p = (__le32 *) bh->b_data + offsets[n];
+		branch[n].key = cpu_to_le32(new_blocks[n]);
+		*branch[n].p = branch[n].key;
+		if ( n == indirect_blks) {
+			current_block = new_blocks[n];
+			/*
+			 * End of chain, update the last new metablock of
+			 * the chain to point to the new allocated
+			 * data blocks numbers
+			 */
+			for (i=1; i < num; i++)
+				*(branch[n].p + i) = cpu_to_le32(++current_block);
+		}
+		BUFFER_TRACE(bh, "marking uptodate");
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+
+		BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+		err = ext4_journal_dirty_metadata(handle, bh);
+		if (err)
+			goto failed;
+	}
+	*blks = num;
+	return err;
+failed:
+	/* Allocation failed, free what we already allocated */
+	for (i = 1; i <= n ; i++) {
+		BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
+		ext4_journal_forget(handle, branch[i].bh);
+	}
+	for (i = 0; i <indirect_blks; i++)
+		ext4_free_blocks(handle, inode, new_blocks[i], 1);
+
+	ext4_free_blocks(handle, inode, new_blocks[i], num);
+
+	return err;
+}
+
+/**
+ * ext4_splice_branch - splice the allocated branch onto inode.
+ * @inode: owner
+ * @block: (logical) number of block we are adding
+ * @chain: chain of indirect blocks (with a missing link - see
+ *	ext4_alloc_branch)
+ * @where: location of missing link
+ * @num:   number of indirect blocks we are adding
+ * @blks:  number of direct blocks we are adding
+ *
+ * This function fills the missing link and does all housekeeping needed in
+ * inode (->i_blocks, etc.). In case of success we end up with the full
+ * chain to new block and return 0.
+ */
+static int ext4_splice_branch(handle_t *handle, struct inode *inode,
+			long block, Indirect *where, int num, int blks)
+{
+	int i;
+	int err = 0;
+	struct ext4_block_alloc_info *block_i;
+	ext4_fsblk_t current_block;
+
+	block_i = EXT4_I(inode)->i_block_alloc_info;
+	/*
+	 * If we're splicing into a [td]indirect block (as opposed to the
+	 * inode) then we need to get write access to the [td]indirect block
+	 * before the splice.
+	 */
+	if (where->bh) {
+		BUFFER_TRACE(where->bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, where->bh);
+		if (err)
+			goto err_out;
+	}
+	/* That's it */
+
+	*where->p = where->key;
+
+	/*
+	 * Update the host buffer_head or inode to point to more just allocated
+	 * direct blocks blocks
+	 */
+	if (num == 0 && blks > 1) {
+		current_block = le32_to_cpu(where->key) + 1;
+		for (i = 1; i < blks; i++)
+			*(where->p + i ) = cpu_to_le32(current_block++);
+	}
+
+	/*
+	 * update the most recently allocated logical & physical block
+	 * in i_block_alloc_info, to assist find the proper goal block for next
+	 * allocation
+	 */
+	if (block_i) {
+		block_i->last_alloc_logical_block = block + blks - 1;
+		block_i->last_alloc_physical_block =
+				le32_to_cpu(where[num].key) + blks - 1;
+	}
+
+	/* We are done with atomic stuff, now do the rest of housekeeping */
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	ext4_mark_inode_dirty(handle, inode);
+
+	/* had we spliced it onto indirect block? */
+	if (where->bh) {
+		/*
+		 * If we spliced it onto an indirect block, we haven't
+		 * altered the inode.  Note however that if it is being spliced
+		 * onto an indirect block at the very end of the file (the
+		 * file is growing) then we *will* alter the inode to reflect
+		 * the new i_size.  But that is not done here - it is done in
+		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
+		 */
+		jbd_debug(5, "splicing indirect only\n");
+		BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
+		err = ext4_journal_dirty_metadata(handle, where->bh);
+		if (err)
+			goto err_out;
+	} else {
+		/*
+		 * OK, we spliced it into the inode itself on a direct block.
+		 * Inode was dirtied above.
+		 */
+		jbd_debug(5, "splicing direct\n");
+	}
+	return err;
+
+err_out:
+	for (i = 1; i <= num; i++) {
+		BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
+		ext4_journal_forget(handle, where[i].bh);
+		ext4_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
+	}
+	ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
+
+	return err;
+}
+
+/*
+ * Allocation strategy is simple: if we have to allocate something, we will
+ * have to go the whole way to leaf. So let's do it before attaching anything
+ * to tree, set linkage between the newborn blocks, write them if sync is
+ * required, recheck the path, free and repeat if check fails, otherwise
+ * set the last missing link (that will protect us from any truncate-generated
+ * removals - all blocks on the path are immune now) and possibly force the
+ * write on the parent block.
+ * That has a nice additional property: no special recovery from the failed
+ * allocations is needed - we simply release blocks and do not touch anything
+ * reachable from inode.
+ *
+ * `handle' can be NULL if create == 0.
+ *
+ * The BKL may not be held on entry here.  Be sure to take it early.
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ */
+int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+		sector_t iblock, unsigned long maxblocks,
+		struct buffer_head *bh_result,
+		int create, int extend_disksize)
+{
+	int err = -EIO;
+	int offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	ext4_fsblk_t goal;
+	int indirect_blks;
+	int blocks_to_boundary = 0;
+	int depth;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int count = 0;
+	ext4_fsblk_t first_block = 0;
+
+
+	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+	J_ASSERT(handle != NULL || create == 0);
+	depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
+
+	if (depth == 0)
+		goto out;
+
+	partial = ext4_get_branch(inode, depth, offsets, chain, &err);
+
+	/* Simplest case - block found, no allocation needed */
+	if (!partial) {
+		first_block = le32_to_cpu(chain[depth - 1].key);
+		clear_buffer_new(bh_result);
+		count++;
+		/*map more blocks*/
+		while (count < maxblocks && count <= blocks_to_boundary) {
+			ext4_fsblk_t blk;
+
+			if (!verify_chain(chain, partial)) {
+				/*
+				 * Indirect block might be removed by
+				 * truncate while we were reading it.
+				 * Handling of that case: forget what we've
+				 * got now. Flag the err as EAGAIN, so it
+				 * will reread.
+				 */
+				err = -EAGAIN;
+				count = 0;
+				break;
+			}
+			blk = le32_to_cpu(*(chain[depth-1].p + count));
+
+			if (blk == first_block + count)
+				count++;
+			else
+				break;
+		}
+		if (err != -EAGAIN)
+			goto got_it;
+	}
+
+	/* Next simple case - plain lookup or failed read of indirect block */
+	if (!create || err == -EIO)
+		goto cleanup;
+
+	mutex_lock(&ei->truncate_mutex);
+
+	/*
+	 * If the indirect block is missing while we are reading
+	 * the chain(ext4_get_branch() returns -EAGAIN err), or
+	 * if the chain has been changed after we grab the semaphore,
+	 * (either because another process truncated this branch, or
+	 * another get_block allocated this branch) re-grab the chain to see if
+	 * the request block has been allocated or not.
+	 *
+	 * Since we already block the truncate/other get_block
+	 * at this point, we will have the current copy of the chain when we
+	 * splice the branch into the tree.
+	 */
+	if (err == -EAGAIN || !verify_chain(chain, partial)) {
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+		partial = ext4_get_branch(inode, depth, offsets, chain, &err);
+		if (!partial) {
+			count++;
+			mutex_unlock(&ei->truncate_mutex);
+			if (err)
+				goto cleanup;
+			clear_buffer_new(bh_result);
+			goto got_it;
+		}
+	}
+
+	/*
+	 * Okay, we need to do block allocation.  Lazily initialize the block
+	 * allocation info here if necessary
+	*/
+	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
+		ext4_init_block_alloc_info(inode);
+
+	goal = ext4_find_goal(inode, iblock, chain, partial);
+
+	/* the number of blocks need to allocate for [d,t]indirect blocks */
+	indirect_blks = (chain + depth) - partial - 1;
+
+	/*
+	 * Next look up the indirect map to count the totoal number of
+	 * direct blocks to allocate for this branch.
+	 */
+	count = ext4_blks_to_allocate(partial, indirect_blks,
+					maxblocks, blocks_to_boundary);
+	/*
+	 * Block out ext4_truncate while we alter the tree
+	 */
+	err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
+				offsets + (partial - chain), partial);
+
+	/*
+	 * The ext4_splice_branch call will free and forget any buffers
+	 * on the new chain if there is a failure, but that risks using
+	 * up transaction credits, especially for bitmaps where the
+	 * credits cannot be returned.  Can we handle this somehow?  We
+	 * may need to return -EAGAIN upwards in the worst case.  --sct
+	 */
+	if (!err)
+		err = ext4_splice_branch(handle, inode, iblock,
+					partial, indirect_blks, count);
+	/*
+	 * i_disksize growing is protected by truncate_mutex.  Don't forget to
+	 * protect it if you're about to implement concurrent
+	 * ext4_get_block() -bzzz
+	*/
+	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
+		ei->i_disksize = inode->i_size;
+	mutex_unlock(&ei->truncate_mutex);
+	if (err)
+		goto cleanup;
+
+	set_buffer_new(bh_result);
+got_it:
+	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	if (count > blocks_to_boundary)
+		set_buffer_boundary(bh_result);
+	err = count;
+	/* Clean up and exit */
+	partial = chain + depth - 1;	/* the whole chain */
+cleanup:
+	while (partial > chain) {
+		BUFFER_TRACE(partial->bh, "call brelse");
+		brelse(partial->bh);
+		partial--;
+	}
+	BUFFER_TRACE(bh_result, "returned");
+out:
+	return err;
+}
+
+#define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32)
+
+static int ext4_get_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	handle_t *handle = journal_current_handle();
+	int ret = 0;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+	if (!create)
+		goto get_block;		/* A read */
+
+	if (max_blocks == 1)
+		goto get_block;		/* A single block get */
+
+	if (handle->h_transaction->t_state == T_LOCKED) {
+		/*
+		 * Huge direct-io writes can hold off commits for long
+		 * periods of time.  Let this commit run.
+		 */
+		ext4_journal_stop(handle);
+		handle = ext4_journal_start(inode, DIO_CREDITS);
+		if (IS_ERR(handle))
+			ret = PTR_ERR(handle);
+		goto get_block;
+	}
+
+	if (handle->h_buffer_credits <= EXT4_RESERVE_TRANS_BLOCKS) {
+		/*
+		 * Getting low on buffer credits...
+		 */
+		ret = ext4_journal_extend(handle, DIO_CREDITS);
+		if (ret > 0) {
+			/*
+			 * Couldn't extend the transaction.  Start a new one.
+			 */
+			ret = ext4_journal_restart(handle, DIO_CREDITS);
+		}
+	}
+
+get_block:
+	if (ret == 0) {
+		ret = ext4_get_blocks_wrap(handle, inode, iblock,
+					max_blocks, bh_result, create, 0);
+		if (ret > 0) {
+			bh_result->b_size = (ret << inode->i_blkbits);
+			ret = 0;
+		}
+	}
+	return ret;
+}
+
+/*
+ * `handle' can be NULL if create is zero
+ */
+struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
+				long block, int create, int *errp)
+{
+	struct buffer_head dummy;
+	int fatal = 0, err;
+
+	J_ASSERT(handle != NULL || create == 0);
+
+	dummy.b_state = 0;
+	dummy.b_blocknr = -1000;
+	buffer_trace_init(&dummy.b_history);
+	err = ext4_get_blocks_wrap(handle, inode, block, 1,
+					&dummy, create, 1);
+	/*
+	 * ext4_get_blocks_handle() returns number of blocks
+	 * mapped. 0 in case of a HOLE.
+	 */
+	if (err > 0) {
+		if (err > 1)
+			WARN_ON(1);
+		err = 0;
+	}
+	*errp = err;
+	if (!err && buffer_mapped(&dummy)) {
+		struct buffer_head *bh;
+		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
+		if (!bh) {
+			*errp = -EIO;
+			goto err;
+		}
+		if (buffer_new(&dummy)) {
+			J_ASSERT(create != 0);
+			J_ASSERT(handle != 0);
+
+			/*
+			 * Now that we do not always journal data, we should
+			 * keep in mind whether this should always journal the
+			 * new buffer as metadata.  For now, regular file
+			 * writes use ext4_get_block instead, so it's not a
+			 * problem.
+			 */
+			lock_buffer(bh);
+			BUFFER_TRACE(bh, "call get_create_access");
+			fatal = ext4_journal_get_create_access(handle, bh);
+			if (!fatal && !buffer_uptodate(bh)) {
+				memset(bh->b_data,0,inode->i_sb->s_blocksize);
+				set_buffer_uptodate(bh);
+			}
+			unlock_buffer(bh);
+			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+			err = ext4_journal_dirty_metadata(handle, bh);
+			if (!fatal)
+				fatal = err;
+		} else {
+			BUFFER_TRACE(bh, "not a new buffer");
+		}
+		if (fatal) {
+			*errp = fatal;
+			brelse(bh);
+			bh = NULL;
+		}
+		return bh;
+	}
+err:
+	return NULL;
+}
+
+struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
+			       int block, int create, int *err)
+{
+	struct buffer_head * bh;
+
+	bh = ext4_getblk(handle, inode, block, create, err);
+	if (!bh)
+		return bh;
+	if (buffer_uptodate(bh))
+		return bh;
+	ll_rw_block(READ_META, 1, &bh);
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		return bh;
+	put_bh(bh);
+	*err = -EIO;
+	return NULL;
+}
+
+static int walk_page_buffers(	handle_t *handle,
+				struct buffer_head *head,
+				unsigned from,
+				unsigned to,
+				int *partial,
+				int (*fn)(	handle_t *handle,
+						struct buffer_head *bh))
+{
+	struct buffer_head *bh;
+	unsigned block_start, block_end;
+	unsigned blocksize = head->b_size;
+	int err, ret = 0;
+	struct buffer_head *next;
+
+	for (	bh = head, block_start = 0;
+		ret == 0 && (bh != head || !block_start);
+		block_start = block_end, bh = next)
+	{
+		next = bh->b_this_page;
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (partial && !buffer_uptodate(bh))
+				*partial = 1;
+			continue;
+		}
+		err = (*fn)(handle, bh);
+		if (!ret)
+			ret = err;
+	}
+	return ret;
+}
+
+/*
+ * To preserve ordering, it is essential that the hole instantiation and
+ * the data write be encapsulated in a single transaction.  We cannot
+ * close off a transaction and start a new one between the ext4_get_block()
+ * and the commit_write().  So doing the jbd2_journal_start at the start of
+ * prepare_write() is the right place.
+ *
+ * Also, this function can nest inside ext4_writepage() ->
+ * block_write_full_page(). In that case, we *know* that ext4_writepage()
+ * has generated enough buffer credits to do the whole page.  So we won't
+ * block on the journal in that case, which is good, because the caller may
+ * be PF_MEMALLOC.
+ *
+ * By accident, ext4 can be reentered when a transaction is open via
+ * quota file writes.  If we were to commit the transaction while thus
+ * reentered, there can be a deadlock - we would be holding a quota
+ * lock, and the commit would never complete if another thread had a
+ * transaction open and was blocking on the quota lock - a ranking
+ * violation.
+ *
+ * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
+ * will _not_ run commit under these circumstances because handle->h_ref
+ * is elevated.  We'll still have enough credits for the tiny quotafile
+ * write.
+ */
+static int do_journal_get_write_access(handle_t *handle,
+					struct buffer_head *bh)
+{
+	if (!buffer_mapped(bh) || buffer_freed(bh))
+		return 0;
+	return ext4_journal_get_write_access(handle, bh);
+}
+
+static int ext4_prepare_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	handle_t *handle;
+	int retries = 0;
+
+retry:
+	handle = ext4_journal_start(inode, needed_blocks);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+		ret = nobh_prepare_write(page, from, to, ext4_get_block);
+	else
+		ret = block_prepare_write(page, from, to, ext4_get_block);
+	if (ret)
+		goto prepare_write_failed;
+
+	if (ext4_should_journal_data(inode)) {
+		ret = walk_page_buffers(handle, page_buffers(page),
+				from, to, NULL, do_journal_get_write_access);
+	}
+prepare_write_failed:
+	if (ret)
+		ext4_journal_stop(handle);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+out:
+	return ret;
+}
+
+int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
+{
+	int err = jbd2_journal_dirty_data(handle, bh);
+	if (err)
+		ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__,
+						bh, handle,err);
+	return err;
+}
+
+/* For commit_write() in data=journal mode */
+static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+{
+	if (!buffer_mapped(bh) || buffer_freed(bh))
+		return 0;
+	set_buffer_uptodate(bh);
+	return ext4_journal_dirty_metadata(handle, bh);
+}
+
+/*
+ * We need to pick up the new inode size which generic_commit_write gave us
+ * `file' can be NULL - eg, when called from page_symlink().
+ *
+ * ext4 never places buffers on inode->i_mapping->private_list.  metadata
+ * buffers are managed internally.
+ */
+static int ext4_ordered_commit_write(struct file *file, struct page *page,
+			     unsigned from, unsigned to)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct inode *inode = page->mapping->host;
+	int ret = 0, ret2;
+
+	ret = walk_page_buffers(handle, page_buffers(page),
+		from, to, NULL, ext4_journal_dirty_data);
+
+	if (ret == 0) {
+		/*
+		 * generic_commit_write() will run mark_inode_dirty() if i_size
+		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
+		 * into that.
+		 */
+		loff_t new_i_size;
+
+		new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+		if (new_i_size > EXT4_I(inode)->i_disksize)
+			EXT4_I(inode)->i_disksize = new_i_size;
+		ret = generic_commit_write(file, page, from, to);
+	}
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+	return ret;
+}
+
+static int ext4_writeback_commit_write(struct file *file, struct page *page,
+			     unsigned from, unsigned to)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct inode *inode = page->mapping->host;
+	int ret = 0, ret2;
+	loff_t new_i_size;
+
+	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	if (new_i_size > EXT4_I(inode)->i_disksize)
+		EXT4_I(inode)->i_disksize = new_i_size;
+
+	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+		ret = nobh_commit_write(file, page, from, to);
+	else
+		ret = generic_commit_write(file, page, from, to);
+
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+	return ret;
+}
+
+static int ext4_journalled_commit_write(struct file *file,
+			struct page *page, unsigned from, unsigned to)
+{
+	handle_t *handle = ext4_journal_current_handle();
+	struct inode *inode = page->mapping->host;
+	int ret = 0, ret2;
+	int partial = 0;
+	loff_t pos;
+
+	/*
+	 * Here we duplicate the generic_commit_write() functionality
+	 */
+	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	ret = walk_page_buffers(handle, page_buffers(page), from,
+				to, &partial, commit_write_fn);
+	if (!partial)
+		SetPageUptodate(page);
+	if (pos > inode->i_size)
+		i_size_write(inode, pos);
+	EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+	if (inode->i_size > EXT4_I(inode)->i_disksize) {
+		EXT4_I(inode)->i_disksize = inode->i_size;
+		ret2 = ext4_mark_inode_dirty(handle, inode);
+		if (!ret)
+			ret = ret2;
+	}
+	ret2 = ext4_journal_stop(handle);
+	if (!ret)
+		ret = ret2;
+	return ret;
+}
+
+/*
+ * bmap() is special.  It gets used by applications such as lilo and by
+ * the swapper to find the on-disk block of a specific piece of data.
+ *
+ * Naturally, this is dangerous if the block concerned is still in the
+ * journal.  If somebody makes a swapfile on an ext4 data-journaling
+ * filesystem and enables swap, then they may get a nasty shock when the
+ * data getting swapped to that swapfile suddenly gets overwritten by
+ * the original zero's written out previously to the journal and
+ * awaiting writeback in the kernel's buffer cache.
+ *
+ * So, if we see any bmap calls here on a modified, data-journaled file,
+ * take extra steps to flush any blocks which might be in the cache.
+ */
+static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+{
+	struct inode *inode = mapping->host;
+	journal_t *journal;
+	int err;
+
+	if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+		/*
+		 * This is a REALLY heavyweight approach, but the use of
+		 * bmap on dirty files is expected to be extremely rare:
+		 * only if we run lilo or swapon on a freshly made file
+		 * do we expect this to happen.
+		 *
+		 * (bmap requires CAP_SYS_RAWIO so this does not
+		 * represent an unprivileged user DOS attack --- we'd be
+		 * in trouble if mortal users could trigger this path at
+		 * will.)
+		 *
+		 * NB. EXT4_STATE_JDATA is not set on files other than
+		 * regular files.  If somebody wants to bmap a directory
+		 * or symlink and gets confused because the buffer
+		 * hasn't yet been flushed to disk, they deserve
+		 * everything they get.
+		 */
+
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+		journal = EXT4_JOURNAL(inode);
+		jbd2_journal_lock_updates(journal);
+		err = jbd2_journal_flush(journal);
+		jbd2_journal_unlock_updates(journal);
+
+		if (err)
+			return 0;
+	}
+
+	return generic_block_bmap(mapping,block,ext4_get_block);
+}
+
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+	get_bh(bh);
+	return 0;
+}
+
+static int bput_one(handle_t *handle, struct buffer_head *bh)
+{
+	put_bh(bh);
+	return 0;
+}
+
+static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
+{
+	if (buffer_mapped(bh))
+		return ext4_journal_dirty_data(handle, bh);
+	return 0;
+}
+
+/*
+ * Note that we always start a transaction even if we're not journalling
+ * data.  This is to preserve ordering: any hole instantiation within
+ * __block_write_full_page -> ext4_get_block() should be journalled
+ * along with the data so we don't crash and then get metadata which
+ * refers to old data.
+ *
+ * In all journalling modes block_write_full_page() will start the I/O.
+ *
+ * Problem:
+ *
+ *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ *		ext4_writepage()
+ *
+ * Similar for:
+ *
+ *	ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ...
+ *
+ * Same applies to ext4_get_block().  We will deadlock on various things like
+ * lock_journal and i_truncate_mutex.
+ *
+ * Setting PF_MEMALLOC here doesn't work - too many internal memory
+ * allocations fail.
+ *
+ * 16May01: If we're reentered then journal_current_handle() will be
+ *	    non-zero. We simply *return*.
+ *
+ * 1 July 2001: @@@ FIXME:
+ *   In journalled data mode, a data buffer may be metadata against the
+ *   current transaction.  But the same file is part of a shared mapping
+ *   and someone does a writepage() on it.
+ *
+ *   We will move the buffer onto the async_data list, but *after* it has
+ *   been dirtied. So there's a small window where we have dirty data on
+ *   BJ_Metadata.
+ *
+ *   Note that this only applies to the last partial page in the file.  The
+ *   bit which block_write_full_page() uses prepare/commit for.  (That's
+ *   broken code anyway: it's wrong for msync()).
+ *
+ *   It's a rare case: affects the final partial page, for journalled data
+ *   where the file is subject to bith write() and writepage() in the same
+ *   transction.  To fix it we'll need a custom block_write_full_page().
+ *   We'll probably need that anyway for journalling writepage() output.
+ *
+ * We don't honour synchronous mounts for writepage().  That would be
+ * disastrous.  Any write() or metadata operation will sync the fs for
+ * us.
+ *
+ * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
+ * we don't need to open a transaction here.
+ */
+static int ext4_ordered_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *page_bufs;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	J_ASSERT(PageLocked(page));
+
+	/*
+	 * We give up here if we're reentered, because it might be for a
+	 * different filesystem.
+	 */
+	if (ext4_journal_current_handle())
+		goto out_fail;
+
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_fail;
+	}
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_sb->s_blocksize,
+				(1 << BH_Dirty)|(1 << BH_Uptodate));
+	}
+	page_bufs = page_buffers(page);
+	walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, bget_one);
+
+	ret = block_write_full_page(page, ext4_get_block, wbc);
+
+	/*
+	 * The page can become unlocked at any point now, and
+	 * truncate can then come in and change things.  So we
+	 * can't touch *page from now on.  But *page_bufs is
+	 * safe due to elevated refcount.
+	 */
+
+	/*
+	 * And attach them to the current transaction.  But only if
+	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
+	 * and generally junk.
+	 */
+	if (ret == 0) {
+		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+					NULL, jbd2_journal_dirty_data_fn);
+		if (!ret)
+			ret = err;
+	}
+	walk_page_buffers(handle, page_bufs, 0,
+			PAGE_CACHE_SIZE, NULL, bput_one);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+
+out_fail:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return ret;
+}
+
+static int ext4_writeback_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	if (ext4_journal_current_handle())
+		goto out_fail;
+
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_fail;
+	}
+
+	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+		ret = nobh_writepage(page, ext4_get_block, wbc);
+	else
+		ret = block_write_full_page(page, ext4_get_block, wbc);
+
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+
+out_fail:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return ret;
+}
+
+static int ext4_journalled_writepage(struct page *page,
+				struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	handle_t *handle = NULL;
+	int ret = 0;
+	int err;
+
+	if (ext4_journal_current_handle())
+		goto no_write;
+
+	handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto no_write;
+	}
+
+	if (!page_has_buffers(page) || PageChecked(page)) {
+		/*
+		 * It's mmapped pagecache.  Add buffers and journal it.  There
+		 * doesn't seem much point in redirtying the page here.
+		 */
+		ClearPageChecked(page);
+		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+					ext4_get_block);
+		if (ret != 0) {
+			ext4_journal_stop(handle);
+			goto out_unlock;
+		}
+		ret = walk_page_buffers(handle, page_buffers(page), 0,
+			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+
+		err = walk_page_buffers(handle, page_buffers(page), 0,
+				PAGE_CACHE_SIZE, NULL, commit_write_fn);
+		if (ret == 0)
+			ret = err;
+		EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+		unlock_page(page);
+	} else {
+		/*
+		 * It may be a page full of checkpoint-mode buffers.  We don't
+		 * really know unless we go poke around in the buffer_heads.
+		 * But block_write_full_page will do the right thing.
+		 */
+		ret = block_write_full_page(page, ext4_get_block, wbc);
+	}
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+out:
+	return ret;
+
+no_write:
+	redirty_page_for_writepage(wbc, page);
+out_unlock:
+	unlock_page(page);
+	goto out;
+}
+
+static int ext4_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, ext4_get_block);
+}
+
+static int
+ext4_readpages(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset)
+{
+	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+	/*
+	 * If it's a full truncate we just forget about the pending dirtying
+	 */
+	if (offset == 0)
+		ClearPageChecked(page);
+
+	jbd2_journal_invalidatepage(journal, page, offset);
+}
+
+static int ext4_releasepage(struct page *page, gfp_t wait)
+{
+	journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+
+	WARN_ON(PageChecked(page));
+	if (!page_has_buffers(page))
+		return 0;
+	return jbd2_journal_try_to_free_buffers(journal, page, wait);
+}
+
+/*
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list.  So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ * If the O_DIRECT write is intantiating holes inside i_size and the machine
+ * crashes then stale disk data _may_ be exposed inside the file.
+ */
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+			const struct iovec *iov, loff_t offset,
+			unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	handle_t *handle = NULL;
+	ssize_t ret;
+	int orphan = 0;
+	size_t count = iov_length(iov, nr_segs);
+
+	if (rw == WRITE) {
+		loff_t final_size = offset + count;
+
+		handle = ext4_journal_start(inode, DIO_CREDITS);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+		if (final_size > inode->i_size) {
+			ret = ext4_orphan_add(handle, inode);
+			if (ret)
+				goto out_stop;
+			orphan = 1;
+			ei->i_disksize = inode->i_size;
+		}
+	}
+
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				 offset, nr_segs,
+				 ext4_get_block, NULL);
+
+	/*
+	 * Reacquire the handle: ext4_get_block() can restart the transaction
+	 */
+	handle = journal_current_handle();
+
+out_stop:
+	if (handle) {
+		int err;
+
+		if (orphan && inode->i_nlink)
+			ext4_orphan_del(handle, inode);
+		if (orphan && ret > 0) {
+			loff_t end = offset + ret;
+			if (end > inode->i_size) {
+				ei->i_disksize = end;
+				i_size_write(inode, end);
+				/*
+				 * We're going to return a positive `ret'
+				 * here due to non-zero-length I/O, so there's
+				 * no way of reporting error returns from
+				 * ext4_mark_inode_dirty() to userspace.  So
+				 * ignore it.
+				 */
+				ext4_mark_inode_dirty(handle, inode);
+			}
+		}
+		err = ext4_journal_stop(handle);
+		if (ret == 0)
+			ret = err;
+	}
+out:
+	return ret;
+}
+
+/*
+ * Pages can be marked dirty completely asynchronously from ext4's journalling
+ * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
+ * much here because ->set_page_dirty is called under VFS locks.  The page is
+ * not necessarily locked.
+ *
+ * We cannot just dirty the page and leave attached buffers clean, because the
+ * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
+ * or jbddirty because all the journalling code will explode.
+ *
+ * So what we do is to mark the page "pending dirty" and next time writepage
+ * is called, propagate that into the buffers appropriately.
+ */
+static int ext4_journalled_set_page_dirty(struct page *page)
+{
+	SetPageChecked(page);
+	return __set_page_dirty_nobuffers(page);
+}
+
+static const struct address_space_operations ext4_ordered_aops = {
+	.readpage	= ext4_readpage,
+	.readpages	= ext4_readpages,
+	.writepage	= ext4_ordered_writepage,
+	.sync_page	= block_sync_page,
+	.prepare_write	= ext4_prepare_write,
+	.commit_write	= ext4_ordered_commit_write,
+	.bmap		= ext4_bmap,
+	.invalidatepage	= ext4_invalidatepage,
+	.releasepage	= ext4_releasepage,
+	.direct_IO	= ext4_direct_IO,
+	.migratepage	= buffer_migrate_page,
+};
+
+static const struct address_space_operations ext4_writeback_aops = {
+	.readpage	= ext4_readpage,
+	.readpages	= ext4_readpages,
+	.writepage	= ext4_writeback_writepage,
+	.sync_page	= block_sync_page,
+	.prepare_write	= ext4_prepare_write,
+	.commit_write	= ext4_writeback_commit_write,
+	.bmap		= ext4_bmap,
+	.invalidatepage	= ext4_invalidatepage,
+	.releasepage	= ext4_releasepage,
+	.direct_IO	= ext4_direct_IO,
+	.migratepage	= buffer_migrate_page,
+};
+
+static const struct address_space_operations ext4_journalled_aops = {
+	.readpage	= ext4_readpage,
+	.readpages	= ext4_readpages,
+	.writepage	= ext4_journalled_writepage,
+	.sync_page	= block_sync_page,
+	.prepare_write	= ext4_prepare_write,
+	.commit_write	= ext4_journalled_commit_write,
+	.set_page_dirty	= ext4_journalled_set_page_dirty,
+	.bmap		= ext4_bmap,
+	.invalidatepage	= ext4_invalidatepage,
+	.releasepage	= ext4_releasepage,
+};
+
+void ext4_set_aops(struct inode *inode)
+{
+	if (ext4_should_order_data(inode))
+		inode->i_mapping->a_ops = &ext4_ordered_aops;
+	else if (ext4_should_writeback_data(inode))
+		inode->i_mapping->a_ops = &ext4_writeback_aops;
+	else
+		inode->i_mapping->a_ops = &ext4_journalled_aops;
+}
+
+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+int ext4_block_truncate_page(handle_t *handle, struct page *page,
+		struct address_space *mapping, loff_t from)
+{
+	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize, iblock, length, pos;
+	struct inode *inode = mapping->host;
+	struct buffer_head *bh;
+	int err = 0;
+	void *kaddr;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	/*
+	 * For "nobh" option,  we can only work if we don't need to
+	 * read-in the page - otherwise we create buffers to do the IO.
+	 */
+	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
+	     ext4_should_writeback_data(inode) && PageUptodate(page)) {
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + offset, 0, length);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		set_page_dirty(page);
+		goto unlock;
+	}
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	/* Find the buffer that contains "offset" */
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	err = 0;
+	if (buffer_freed(bh)) {
+		BUFFER_TRACE(bh, "freed: skip");
+		goto unlock;
+	}
+
+	if (!buffer_mapped(bh)) {
+		BUFFER_TRACE(bh, "unmapped");
+		ext4_get_block(inode, iblock, bh, 0);
+		/* unmapped? It's a hole - nothing to do */
+		if (!buffer_mapped(bh)) {
+			BUFFER_TRACE(bh, "still unmapped");
+			goto unlock;
+		}
+	}
+
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!buffer_uptodate(bh)) {
+		err = -EIO;
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		/* Uhhuh. Read error. Complain and punt. */
+		if (!buffer_uptodate(bh))
+			goto unlock;
+	}
+
+	if (ext4_should_journal_data(inode)) {
+		BUFFER_TRACE(bh, "get write access");
+		err = ext4_journal_get_write_access(handle, bh);
+		if (err)
+			goto unlock;
+	}
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr + offset, 0, length);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	BUFFER_TRACE(bh, "zeroed end of block");
+
+	err = 0;
+	if (ext4_should_journal_data(inode)) {
+		err = ext4_journal_dirty_metadata(handle, bh);
+	} else {
+		if (ext4_should_order_data(inode))
+			err = ext4_journal_dirty_data(handle, bh);
+		mark_buffer_dirty(bh);
+	}
+
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+/*
+ * Probably it should be a library function... search for first non-zero word
+ * or memcmp with zero_page, whatever is better for particular architecture.
+ * Linus?
+ */
+static inline int all_zeroes(__le32 *p, __le32 *q)
+{
+	while (p < q)
+		if (*p++)
+			return 0;
+	return 1;
+}
+
+/**
+ *	ext4_find_shared - find the indirect blocks for partial truncation.
+ *	@inode:	  inode in question
+ *	@depth:	  depth of the affected branch
+ *	@offsets: offsets of pointers in that branch (see ext4_block_to_path)
+ *	@chain:	  place to store the pointers to partial indirect blocks
+ *	@top:	  place to the (detached) top of branch
+ *
+ *	This is a helper function used by ext4_truncate().
+ *
+ *	When we do truncate() we may have to clean the ends of several
+ *	indirect blocks but leave the blocks themselves alive. Block is
+ *	partially truncated if some data below the new i_size is refered
+ *	from it (and it is on the path to the first completely truncated
+ *	data block, indeed).  We have to free the top of that path along
+ *	with everything to the right of the path. Since no allocation
+ *	past the truncation point is possible until ext4_truncate()
+ *	finishes, we may safely do the latter, but top of branch may
+ *	require special attention - pageout below the truncation point
+ *	might try to populate it.
+ *
+ *	We atomically detach the top of branch from the tree, store the
+ *	block number of its root in *@top, pointers to buffer_heads of
+ *	partially truncated blocks - in @chain[].bh and pointers to
+ *	their last elements that should not be removed - in
+ *	@chain[].p. Return value is the pointer to last filled element
+ *	of @chain.
+ *
+ *	The work left to caller to do the actual freeing of subtrees:
+ *		a) free the subtree starting from *@top
+ *		b) free the subtrees whose roots are stored in
+ *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
+ *		c) free the subtrees growing from the inode past the @chain[0].
+ *			(no partially truncated stuff there).  */
+
+static Indirect *ext4_find_shared(struct inode *inode, int depth,
+			int offsets[4], Indirect chain[4], __le32 *top)
+{
+	Indirect *partial, *p;
+	int k, err;
+
+	*top = 0;
+	/* Make k index the deepest non-null offest + 1 */
+	for (k = depth; k > 1 && !offsets[k-1]; k--)
+		;
+	partial = ext4_get_branch(inode, k, offsets, chain, &err);
+	/* Writer: pointers */
+	if (!partial)
+		partial = chain + k-1;
+	/*
+	 * If the branch acquired continuation since we've looked at it -
+	 * fine, it should all survive and (new) top doesn't belong to us.
+	 */
+	if (!partial->key && *partial->p)
+		/* Writer: end */
+		goto no_top;
+	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
+		;
+	/*
+	 * OK, we've found the last block that must survive. The rest of our
+	 * branch should be detached before unlocking. However, if that rest
+	 * of branch is all ours and does not grow immediately from the inode
+	 * it's easier to cheat and just decrement partial->p.
+	 */
+	if (p == chain + k - 1 && p > chain) {
+		p->p--;
+	} else {
+		*top = *p->p;
+		/* Nope, don't do this in ext4.  Must leave the tree intact */
+#if 0
+		*p->p = 0;
+#endif
+	}
+	/* Writer: end */
+
+	while(partial > p) {
+		brelse(partial->bh);
+		partial--;
+	}
+no_top:
+	return partial;
+}
+
+/*
+ * Zero a number of block pointers in either an inode or an indirect block.
+ * If we restart the transaction we must again get write access to the
+ * indirect block for further modification.
+ *
+ * We release `count' blocks on disk, but (last - first) may be greater
+ * than `count' because there can be holes in there.
+ */
+static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
+		struct buffer_head *bh, ext4_fsblk_t block_to_free,
+		unsigned long count, __le32 *first, __le32 *last)
+{
+	__le32 *p;
+	if (try_to_extend_transaction(handle, inode)) {
+		if (bh) {
+			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+			ext4_journal_dirty_metadata(handle, bh);
+		}
+		ext4_mark_inode_dirty(handle, inode);
+		ext4_journal_test_restart(handle, inode);
+		if (bh) {
+			BUFFER_TRACE(bh, "retaking write access");
+			ext4_journal_get_write_access(handle, bh);
+		}
+	}
+
+	/*
+	 * Any buffers which are on the journal will be in memory. We find
+	 * them on the hash table so jbd2_journal_revoke() will run jbd2_journal_forget()
+	 * on them.  We've already detached each block from the file, so
+	 * bforget() in jbd2_journal_forget() should be safe.
+	 *
+	 * AKPM: turn on bforget in jbd2_journal_forget()!!!
+	 */
+	for (p = first; p < last; p++) {
+		u32 nr = le32_to_cpu(*p);
+		if (nr) {
+			struct buffer_head *bh;
+
+			*p = 0;
+			bh = sb_find_get_block(inode->i_sb, nr);
+			ext4_forget(handle, 0, inode, bh, nr);
+		}
+	}
+
+	ext4_free_blocks(handle, inode, block_to_free, count);
+}
+
+/**
+ * ext4_free_data - free a list of data blocks
+ * @handle:	handle for this transaction
+ * @inode:	inode we are dealing with
+ * @this_bh:	indirect buffer_head which contains *@first and *@last
+ * @first:	array of block numbers
+ * @last:	points immediately past the end of array
+ *
+ * We are freeing all blocks refered from that array (numbers are stored as
+ * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+ *
+ * We accumulate contiguous runs of blocks to free.  Conveniently, if these
+ * blocks are contiguous then releasing them at one time will only affect one
+ * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+ * actually use a lot of journal space.
+ *
+ * @this_bh will be %NULL if @first and @last point into the inode's direct
+ * block pointers.
+ */
+static void ext4_free_data(handle_t *handle, struct inode *inode,
+			   struct buffer_head *this_bh,
+			   __le32 *first, __le32 *last)
+{
+	ext4_fsblk_t block_to_free = 0;    /* Starting block # of a run */
+	unsigned long count = 0;	    /* Number of blocks in the run */
+	__le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
+					       corresponding to
+					       block_to_free */
+	ext4_fsblk_t nr;		    /* Current block # */
+	__le32 *p;			    /* Pointer into inode/ind
+					       for current block */
+	int err;
+
+	if (this_bh) {				/* For indirect block */
+		BUFFER_TRACE(this_bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, this_bh);
+		/* Important: if we can't update the indirect pointers
+		 * to the blocks, we can't free them. */
+		if (err)
+			return;
+	}
+
+	for (p = first; p < last; p++) {
+		nr = le32_to_cpu(*p);
+		if (nr) {
+			/* accumulate blocks to free if they're contiguous */
+			if (count == 0) {
+				block_to_free = nr;
+				block_to_free_p = p;
+				count = 1;
+			} else if (nr == block_to_free + count) {
+				count++;
+			} else {
+				ext4_clear_blocks(handle, inode, this_bh,
+						  block_to_free,
+						  count, block_to_free_p, p);
+				block_to_free = nr;
+				block_to_free_p = p;
+				count = 1;
+			}
+		}
+	}
+
+	if (count > 0)
+		ext4_clear_blocks(handle, inode, this_bh, block_to_free,
+				  count, block_to_free_p, p);
+
+	if (this_bh) {
+		BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
+		ext4_journal_dirty_metadata(handle, this_bh);
+	}
+}
+
+/**
+ *	ext4_free_branches - free an array of branches
+ *	@handle: JBD handle for this transaction
+ *	@inode:	inode we are dealing with
+ *	@parent_bh: the buffer_head which contains *@first and *@last
+ *	@first:	array of block numbers
+ *	@last:	pointer immediately past the end of array
+ *	@depth:	depth of the branches to free
+ *
+ *	We are freeing all blocks refered from these branches (numbers are
+ *	stored as little-endian 32-bit) and updating @inode->i_blocks
+ *	appropriately.
+ */
+static void ext4_free_branches(handle_t *handle, struct inode *inode,
+			       struct buffer_head *parent_bh,
+			       __le32 *first, __le32 *last, int depth)
+{
+	ext4_fsblk_t nr;
+	__le32 *p;
+
+	if (is_handle_aborted(handle))
+		return;
+
+	if (depth--) {
+		struct buffer_head *bh;
+		int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+		p = last;
+		while (--p >= first) {
+			nr = le32_to_cpu(*p);
+			if (!nr)
+				continue;		/* A hole */
+
+			/* Go read the buffer for the next level down */
+			bh = sb_bread(inode->i_sb, nr);
+
+			/*
+			 * A read failure? Report error and clear slot
+			 * (should be rare).
+			 */
+			if (!bh) {
+				ext4_error(inode->i_sb, "ext4_free_branches",
+					   "Read failure, inode=%lu, block=%llu",
+					   inode->i_ino, nr);
+				continue;
+			}
+
+			/* This zaps the entire block.  Bottom up. */
+			BUFFER_TRACE(bh, "free child branches");
+			ext4_free_branches(handle, inode, bh,
+					   (__le32*)bh->b_data,
+					   (__le32*)bh->b_data + addr_per_block,
+					   depth);
+
+			/*
+			 * We've probably journalled the indirect block several
+			 * times during the truncate.  But it's no longer
+			 * needed and we now drop it from the transaction via
+			 * jbd2_journal_revoke().
+			 *
+			 * That's easy if it's exclusively part of this
+			 * transaction.  But if it's part of the committing
+			 * transaction then jbd2_journal_forget() will simply
+			 * brelse() it.  That means that if the underlying
+			 * block is reallocated in ext4_get_block(),
+			 * unmap_underlying_metadata() will find this block
+			 * and will try to get rid of it.  damn, damn.
+			 *
+			 * If this block has already been committed to the
+			 * journal, a revoke record will be written.  And
+			 * revoke records must be emitted *before* clearing
+			 * this block's bit in the bitmaps.
+			 */
+			ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+
+			/*
+			 * Everything below this this pointer has been
+			 * released.  Now let this top-of-subtree go.
+			 *
+			 * We want the freeing of this indirect block to be
+			 * atomic in the journal with the updating of the
+			 * bitmap block which owns it.  So make some room in
+			 * the journal.
+			 *
+			 * We zero the parent pointer *after* freeing its
+			 * pointee in the bitmaps, so if extend_transaction()
+			 * for some reason fails to put the bitmap changes and
+			 * the release into the same transaction, recovery
+			 * will merely complain about releasing a free block,
+			 * rather than leaking blocks.
+			 */
+			if (is_handle_aborted(handle))
+				return;
+			if (try_to_extend_transaction(handle, inode)) {
+				ext4_mark_inode_dirty(handle, inode);
+				ext4_journal_test_restart(handle, inode);
+			}
+
+			ext4_free_blocks(handle, inode, nr, 1);
+
+			if (parent_bh) {
+				/*
+				 * The block which we have just freed is
+				 * pointed to by an indirect block: journal it
+				 */
+				BUFFER_TRACE(parent_bh, "get_write_access");
+				if (!ext4_journal_get_write_access(handle,
+								   parent_bh)){
+					*p = 0;
+					BUFFER_TRACE(parent_bh,
+					"call ext4_journal_dirty_metadata");
+					ext4_journal_dirty_metadata(handle,
+								    parent_bh);
+				}
+			}
+		}
+	} else {
+		/* We have reached the bottom of the tree. */
+		BUFFER_TRACE(parent_bh, "free data blocks");
+		ext4_free_data(handle, inode, parent_bh, first, last);
+	}
+}
+
+/*
+ * ext4_truncate()
+ *
+ * We block out ext4_get_block() block instantiations across the entire
+ * transaction, and VFS/VM ensures that ext4_truncate() cannot run
+ * simultaneously on behalf of the same inode.
+ *
+ * As we work through the truncate and commmit bits of it to the journal there
+ * is one core, guiding principle: the file's tree must always be consistent on
+ * disk.  We must be able to restart the truncate after a crash.
+ *
+ * The file's tree may be transiently inconsistent in memory (although it
+ * probably isn't), but whenever we close off and commit a journal transaction,
+ * the contents of (the filesystem + the journal) must be consistent and
+ * restartable.  It's pretty simple, really: bottom up, right to left (although
+ * left-to-right works OK too).
+ *
+ * Note that at recovery time, journal replay occurs *before* the restart of
+ * truncate against the orphan inode list.
+ *
+ * The committed inode has the new, desired i_size (which is the same as
+ * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
+ * that this inode's truncate did not complete and it will again call
+ * ext4_truncate() to have another go.  So there will be instantiated blocks
+ * to the right of the truncation point in a crashed ext4 filesystem.  But
+ * that's fine - as long as they are linked from the inode, the post-crash
+ * ext4_truncate() run will find them and release them.
+ */
+void ext4_truncate(struct inode *inode)
+{
+	handle_t *handle;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	__le32 *i_data = ei->i_data;
+	int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	struct address_space *mapping = inode->i_mapping;
+	int offsets[4];
+	Indirect chain[4];
+	Indirect *partial;
+	__le32 nr = 0;
+	int n;
+	long last_block;
+	unsigned blocksize = inode->i_sb->s_blocksize;
+	struct page *page;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)))
+		return;
+	if (ext4_inode_is_fast_symlink(inode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+
+	/*
+	 * We have to lock the EOF page here, because lock_page() nests
+	 * outside jbd2_journal_start().
+	 */
+	if ((inode->i_size & (blocksize - 1)) == 0) {
+		/* Block boundary? Nothing to do */
+		page = NULL;
+	} else {
+		page = grab_cache_page(mapping,
+				inode->i_size >> PAGE_CACHE_SHIFT);
+		if (!page)
+			return;
+	}
+
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+		return ext4_ext_truncate(inode, page);
+
+	handle = start_transaction(inode);
+	if (IS_ERR(handle)) {
+		if (page) {
+			clear_highpage(page);
+			flush_dcache_page(page);
+			unlock_page(page);
+			page_cache_release(page);
+		}
+		return;		/* AKPM: return what? */
+	}
+
+	last_block = (inode->i_size + blocksize-1)
+					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+
+	if (page)
+		ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+
+	n = ext4_block_to_path(inode, last_block, offsets, NULL);
+	if (n == 0)
+		goto out_stop;	/* error */
+
+	/*
+	 * OK.  This truncate is going to happen.  We add the inode to the
+	 * orphan list, so that if this truncate spans multiple transactions,
+	 * and we crash, we will resume the truncate when the filesystem
+	 * recovers.  It also marks the inode dirty, to catch the new size.
+	 *
+	 * Implication: the file must always be in a sane, consistent
+	 * truncatable state while each transaction commits.
+	 */
+	if (ext4_orphan_add(handle, inode))
+		goto out_stop;
+
+	/*
+	 * The orphan list entry will now protect us from any crash which
+	 * occurs before the truncate completes, so it is now safe to propagate
+	 * the new, shorter inode size (held for now in i_size) into the
+	 * on-disk inode. We do this via i_disksize, which is the value which
+	 * ext4 *really* writes onto the disk inode.
+	 */
+	ei->i_disksize = inode->i_size;
+
+	/*
+	 * From here we block out all ext4_get_block() callers who want to
+	 * modify the block allocation tree.
+	 */
+	mutex_lock(&ei->truncate_mutex);
+
+	if (n == 1) {		/* direct blocks */
+		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
+			       i_data + EXT4_NDIR_BLOCKS);
+		goto do_indirects;
+	}
+
+	partial = ext4_find_shared(inode, n, offsets, chain, &nr);
+	/* Kill the top of shared branch (not detached) */
+	if (nr) {
+		if (partial == chain) {
+			/* Shared branch grows from the inode */
+			ext4_free_branches(handle, inode, NULL,
+					   &nr, &nr+1, (chain+n-1) - partial);
+			*partial->p = 0;
+			/*
+			 * We mark the inode dirty prior to restart,
+			 * and prior to stop.  No need for it here.
+			 */
+		} else {
+			/* Shared branch grows from an indirect block */
+			BUFFER_TRACE(partial->bh, "get_write_access");
+			ext4_free_branches(handle, inode, partial->bh,
+					partial->p,
+					partial->p+1, (chain+n-1) - partial);
+		}
+	}
+	/* Clear the ends of indirect blocks on the shared branch */
+	while (partial > chain) {
+		ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
+				   (__le32*)partial->bh->b_data+addr_per_block,
+				   (chain+n-1) - partial);
+		BUFFER_TRACE(partial->bh, "call brelse");
+		brelse (partial->bh);
+		partial--;
+	}
+do_indirects:
+	/* Kill the remaining (whole) subtrees */
+	switch (offsets[0]) {
+	default:
+		nr = i_data[EXT4_IND_BLOCK];
+		if (nr) {
+			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
+			i_data[EXT4_IND_BLOCK] = 0;
+		}
+	case EXT4_IND_BLOCK:
+		nr = i_data[EXT4_DIND_BLOCK];
+		if (nr) {
+			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
+			i_data[EXT4_DIND_BLOCK] = 0;
+		}
+	case EXT4_DIND_BLOCK:
+		nr = i_data[EXT4_TIND_BLOCK];
+		if (nr) {
+			ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
+			i_data[EXT4_TIND_BLOCK] = 0;
+		}
+	case EXT4_TIND_BLOCK:
+		;
+	}
+
+	ext4_discard_reservation(inode);
+
+	mutex_unlock(&ei->truncate_mutex);
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	ext4_mark_inode_dirty(handle, inode);
+
+	/*
+	 * In a multi-transaction truncate, we only make the final transaction
+	 * synchronous
+	 */
+	if (IS_SYNC(inode))
+		handle->h_sync = 1;
+out_stop:
+	/*
+	 * If this was a simple ftruncate(), and the file will remain alive
+	 * then we need to clear up the orphan record which we created above.
+	 * However, if this was a real unlink then we were called by
+	 * ext4_delete_inode(), and we allow that function to clean up the
+	 * orphan info for us.
+	 */
+	if (inode->i_nlink)
+		ext4_orphan_del(handle, inode);
+
+	ext4_journal_stop(handle);
+}
+
+static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
+		unsigned long ino, struct ext4_iloc *iloc)
+{
+	unsigned long desc, group_desc, block_group;
+	unsigned long offset;
+	ext4_fsblk_t block;
+	struct buffer_head *bh;
+	struct ext4_group_desc * gdp;
+
+	if (!ext4_valid_inum(sb, ino)) {
+		/*
+		 * This error is already checked for in namei.c unless we are
+		 * looking at an NFS filehandle, in which case no error
+		 * report is needed
+		 */
+		return 0;
+	}
+
+	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	if (block_group >= EXT4_SB(sb)->s_groups_count) {
+		ext4_error(sb,"ext4_get_inode_block","group >= groups count");
+		return 0;
+	}
+	smp_rmb();
+	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
+	desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+	bh = EXT4_SB(sb)->s_group_desc[group_desc];
+	if (!bh) {
+		ext4_error (sb, "ext4_get_inode_block",
+			    "Descriptor not loaded");
+		return 0;
+	}
+
+	gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data +
+		desc * EXT4_DESC_SIZE(sb));
+	/*
+	 * Figure out the offset within the block group inode table
+	 */
+	offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
+		EXT4_INODE_SIZE(sb);
+	block = ext4_inode_table(sb, gdp) +
+		(offset >> EXT4_BLOCK_SIZE_BITS(sb));
+
+	iloc->block_group = block_group;
+	iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
+	return block;
+}
+
+/*
+ * ext4_get_inode_loc returns with an extra refcount against the inode's
+ * underlying buffer_head on success. If 'in_mem' is true, we have all
+ * data in memory that is needed to recreate the on-disk version of this
+ * inode.
+ */
+static int __ext4_get_inode_loc(struct inode *inode,
+				struct ext4_iloc *iloc, int in_mem)
+{
+	ext4_fsblk_t block;
+	struct buffer_head *bh;
+
+	block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
+	if (!block)
+		return -EIO;
+
+	bh = sb_getblk(inode->i_sb, block);
+	if (!bh) {
+		ext4_error (inode->i_sb, "ext4_get_inode_loc",
+				"unable to read inode block - "
+				"inode=%lu, block=%llu",
+				 inode->i_ino, block);
+		return -EIO;
+	}
+	if (!buffer_uptodate(bh)) {
+		lock_buffer(bh);
+		if (buffer_uptodate(bh)) {
+			/* someone brought it uptodate while we waited */
+			unlock_buffer(bh);
+			goto has_buffer;
+		}
+
+		/*
+		 * If we have all information of the inode in memory and this
+		 * is the only valid inode in the block, we need not read the
+		 * block.
+		 */
+		if (in_mem) {
+			struct buffer_head *bitmap_bh;
+			struct ext4_group_desc *desc;
+			int inodes_per_buffer;
+			int inode_offset, i;
+			int block_group;
+			int start;
+
+			block_group = (inode->i_ino - 1) /
+					EXT4_INODES_PER_GROUP(inode->i_sb);
+			inodes_per_buffer = bh->b_size /
+				EXT4_INODE_SIZE(inode->i_sb);
+			inode_offset = ((inode->i_ino - 1) %
+					EXT4_INODES_PER_GROUP(inode->i_sb));
+			start = inode_offset & ~(inodes_per_buffer - 1);
+
+			/* Is the inode bitmap in cache? */
+			desc = ext4_get_group_desc(inode->i_sb,
+						block_group, NULL);
+			if (!desc)
+				goto make_io;
+
+			bitmap_bh = sb_getblk(inode->i_sb,
+				ext4_inode_bitmap(inode->i_sb, desc));
+			if (!bitmap_bh)
+				goto make_io;
+
+			/*
+			 * If the inode bitmap isn't in cache then the
+			 * optimisation may end up performing two reads instead
+			 * of one, so skip it.
+			 */
+			if (!buffer_uptodate(bitmap_bh)) {
+				brelse(bitmap_bh);
+				goto make_io;
+			}
+			for (i = start; i < start + inodes_per_buffer; i++) {
+				if (i == inode_offset)
+					continue;
+				if (ext4_test_bit(i, bitmap_bh->b_data))
+					break;
+			}
+			brelse(bitmap_bh);
+			if (i == start + inodes_per_buffer) {
+				/* all other inodes are free, so skip I/O */
+				memset(bh->b_data, 0, bh->b_size);
+				set_buffer_uptodate(bh);
+				unlock_buffer(bh);
+				goto has_buffer;
+			}
+		}
+
+make_io:
+		/*
+		 * There are other valid inodes in the buffer, this inode
+		 * has in-inode xattrs, or we don't have this inode in memory.
+		 * Read the block from disk.
+		 */
+		get_bh(bh);
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ_META, bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			ext4_error(inode->i_sb, "ext4_get_inode_loc",
+					"unable to read inode block - "
+					"inode=%lu, block=%llu",
+					inode->i_ino, block);
+			brelse(bh);
+			return -EIO;
+		}
+	}
+has_buffer:
+	iloc->bh = bh;
+	return 0;
+}
+
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
+{
+	/* We have all inode data except xattrs in memory here. */
+	return __ext4_get_inode_loc(inode, iloc,
+		!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+}
+
+void ext4_set_inode_flags(struct inode *inode)
+{
+	unsigned int flags = EXT4_I(inode)->i_flags;
+
+	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	if (flags & EXT4_SYNC_FL)
+		inode->i_flags |= S_SYNC;
+	if (flags & EXT4_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+	if (flags & EXT4_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+	if (flags & EXT4_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+	if (flags & EXT4_DIRSYNC_FL)
+		inode->i_flags |= S_DIRSYNC;
+}
+
+void ext4_read_inode(struct inode * inode)
+{
+	struct ext4_iloc iloc;
+	struct ext4_inode *raw_inode;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct buffer_head *bh;
+	int block;
+
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+	ei->i_acl = EXT4_ACL_NOT_CACHED;
+	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
+#endif
+	ei->i_block_alloc_info = NULL;
+
+	if (__ext4_get_inode_loc(inode, &iloc, 0))
+		goto bad_inode;
+	bh = iloc.bh;
+	raw_inode = ext4_raw_inode(&iloc);
+	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+	if(!(test_opt (inode->i_sb, NO_UID32))) {
+		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+	}
+	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+	inode->i_size = le32_to_cpu(raw_inode->i_size);
+	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
+	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
+	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
+	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+
+	ei->i_state = 0;
+	ei->i_dir_start_lookup = 0;
+	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+	/* We now have enough fields to check if the inode was active or not.
+	 * This is needed because nfsd might try to access dead inodes
+	 * the test is that same one that e2fsck uses
+	 * NeilBrown 1999oct15
+	 */
+	if (inode->i_nlink == 0) {
+		if (inode->i_mode == 0 ||
+		    !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+			/* this inode is deleted */
+			brelse (bh);
+			goto bad_inode;
+		}
+		/* The only unlinked inodes we let through here have
+		 * valid i_mode and are being read by the orphan
+		 * recovery code: that's fine, we're about to complete
+		 * the process of deleting those. */
+	}
+	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
+	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+#ifdef EXT4_FRAGMENTS
+	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
+	ei->i_frag_no = raw_inode->i_frag;
+	ei->i_frag_size = raw_inode->i_fsize;
+#endif
+	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+	    cpu_to_le32(EXT4_OS_HURD))
+		ei->i_file_acl |=
+			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+	if (!S_ISREG(inode->i_mode)) {
+		ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+	} else {
+		inode->i_size |=
+			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
+	}
+	ei->i_disksize = inode->i_size;
+	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+	ei->i_block_group = iloc.block_group;
+	/*
+	 * NOTE! The in-memory inode i_data array is in little-endian order
+	 * even on big-endian machines: we do NOT byteswap the block numbers!
+	 */
+	for (block = 0; block < EXT4_N_BLOCKS; block++)
+		ei->i_data[block] = raw_inode->i_block[block];
+	INIT_LIST_HEAD(&ei->i_orphan);
+
+	if (inode->i_ino >= EXT4_FIRST_INO(inode->i_sb) + 1 &&
+	    EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+		/*
+		 * When mke2fs creates big inodes it does not zero out
+		 * the unused bytes above EXT4_GOOD_OLD_INODE_SIZE,
+		 * so ignore those first few inodes.
+		 */
+		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+		    EXT4_INODE_SIZE(inode->i_sb))
+			goto bad_inode;
+		if (ei->i_extra_isize == 0) {
+			/* The extra space is currently unused. Use it. */
+			ei->i_extra_isize = sizeof(struct ext4_inode) -
+					    EXT4_GOOD_OLD_INODE_SIZE;
+		} else {
+			__le32 *magic = (void *)raw_inode +
+					EXT4_GOOD_OLD_INODE_SIZE +
+					ei->i_extra_isize;
+			if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+				 ei->i_state |= EXT4_STATE_XATTR;
+		}
+	} else
+		ei->i_extra_isize = 0;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &ext4_file_inode_operations;
+		inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(inode);
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &ext4_dir_inode_operations;
+		inode->i_fop = &ext4_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (ext4_inode_is_fast_symlink(inode))
+			inode->i_op = &ext4_fast_symlink_inode_operations;
+		else {
+			inode->i_op = &ext4_symlink_inode_operations;
+			ext4_set_aops(inode);
+		}
+	} else {
+		inode->i_op = &ext4_special_inode_operations;
+		if (raw_inode->i_block[0])
+			init_special_inode(inode, inode->i_mode,
+			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
+		else
+			init_special_inode(inode, inode->i_mode,
+			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+	}
+	brelse (iloc.bh);
+	ext4_set_inode_flags(inode);
+	return;
+
+bad_inode:
+	make_bad_inode(inode);
+	return;
+}
+
+/*
+ * Post the struct inode info into an on-disk inode location in the
+ * buffer-cache.  This gobbles the caller's reference to the
+ * buffer_head in the inode location struct.
+ *
+ * The caller must have write access to iloc->bh.
+ */
+static int ext4_do_update_inode(handle_t *handle,
+				struct inode *inode,
+				struct ext4_iloc *iloc)
+{
+	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct buffer_head *bh = iloc->bh;
+	int err = 0, rc, block;
+
+	/* For fields not not tracking in the in-memory inode,
+	 * initialise them to zero for new inodes. */
+	if (ei->i_state & EXT4_STATE_NEW)
+		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+
+	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+	if(!(test_opt(inode->i_sb, NO_UID32))) {
+		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
+		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+/*
+ * Fix up interoperability with old kernels. Otherwise, old inodes get
+ * re-used with the upper 16 bits of the uid/gid intact
+ */
+		if(!ei->i_dtime) {
+			raw_inode->i_uid_high =
+				cpu_to_le16(high_16_bits(inode->i_uid));
+			raw_inode->i_gid_high =
+				cpu_to_le16(high_16_bits(inode->i_gid));
+		} else {
+			raw_inode->i_uid_high = 0;
+			raw_inode->i_gid_high = 0;
+		}
+	} else {
+		raw_inode->i_uid_low =
+			cpu_to_le16(fs_high2lowuid(inode->i_uid));
+		raw_inode->i_gid_low =
+			cpu_to_le16(fs_high2lowgid(inode->i_gid));
+		raw_inode->i_uid_high = 0;
+		raw_inode->i_gid_high = 0;
+	}
+	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
+	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+#ifdef EXT4_FRAGMENTS
+	raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
+	raw_inode->i_frag = ei->i_frag_no;
+	raw_inode->i_fsize = ei->i_frag_size;
+#endif
+	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+	    cpu_to_le32(EXT4_OS_HURD))
+		raw_inode->i_file_acl_high =
+			cpu_to_le16(ei->i_file_acl >> 32);
+	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
+	if (!S_ISREG(inode->i_mode)) {
+		raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
+	} else {
+		raw_inode->i_size_high =
+			cpu_to_le32(ei->i_disksize >> 32);
+		if (ei->i_disksize > 0x7fffffffULL) {
+			struct super_block *sb = inode->i_sb;
+			if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+			    EXT4_SB(sb)->s_es->s_rev_level ==
+					cpu_to_le32(EXT4_GOOD_OLD_REV)) {
+			       /* If this is the first large file
+				* created, add a flag to the superblock.
+				*/
+				err = ext4_journal_get_write_access(handle,
+						EXT4_SB(sb)->s_sbh);
+				if (err)
+					goto out_brelse;
+				ext4_update_dynamic_rev(sb);
+				EXT4_SET_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+				sb->s_dirt = 1;
+				handle->h_sync = 1;
+				err = ext4_journal_dirty_metadata(handle,
+						EXT4_SB(sb)->s_sbh);
+			}
+		}
+	}
+	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			raw_inode->i_block[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			raw_inode->i_block[1] = 0;
+		} else {
+			raw_inode->i_block[0] = 0;
+			raw_inode->i_block[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			raw_inode->i_block[2] = 0;
+		}
+	} else for (block = 0; block < EXT4_N_BLOCKS; block++)
+		raw_inode->i_block[block] = ei->i_data[block];
+
+	if (ei->i_extra_isize)
+		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+
+	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+	rc = ext4_journal_dirty_metadata(handle, bh);
+	if (!err)
+		err = rc;
+	ei->i_state &= ~EXT4_STATE_NEW;
+
+out_brelse:
+	brelse (bh);
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * ext4_write_inode()
+ *
+ * We are called from a few places:
+ *
+ * - Within generic_file_write() for O_SYNC files.
+ *   Here, there will be no transaction running. We wait for any running
+ *   trasnaction to commit.
+ *
+ * - Within sys_sync(), kupdate and such.
+ *   We wait on commit, if tol to.
+ *
+ * - Within prune_icache() (PF_MEMALLOC == true)
+ *   Here we simply return.  We can't afford to block kswapd on the
+ *   journal commit.
+ *
+ * In all cases it is actually safe for us to return without doing anything,
+ * because the inode has been copied into a raw inode buffer in
+ * ext4_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
+ * knfsd.
+ *
+ * Note that we are absolutely dependent upon all inode dirtiers doing the
+ * right thing: they *must* call mark_inode_dirty() after dirtying info in
+ * which we are interested.
+ *
+ * It would be a bug for them to not do this.  The code:
+ *
+ *	mark_inode_dirty(inode)
+ *	stuff();
+ *	inode->i_size = expr;
+ *
+ * is in error because a kswapd-driven write_inode() could occur while
+ * `stuff()' is running, and the new i_size will be lost.  Plus the inode
+ * will no longer be on the superblock's dirty inode list.
+ */
+int ext4_write_inode(struct inode *inode, int wait)
+{
+	if (current->flags & PF_MEMALLOC)
+		return 0;
+
+	if (ext4_journal_current_handle()) {
+		jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
+		dump_stack();
+		return -EIO;
+	}
+
+	if (!wait)
+		return 0;
+
+	return ext4_force_commit(inode->i_sb);
+}
+
+/*
+ * ext4_setattr()
+ *
+ * Called from notify_change.
+ *
+ * We want to trap VFS attempts to truncate the file as soon as
+ * possible.  In particular, we want to make sure that when the VFS
+ * shrinks i_size, we put the inode on the orphan list and modify
+ * i_disksize immediately, so that during the subsequent flushing of
+ * dirty pages and freeing of disk blocks, we can guarantee that any
+ * commit will leave the blocks being flushed in an unused state on
+ * disk.  (On recovery, the inode will get truncated and the blocks will
+ * be freed, so we have a strong guarantee that no future commit will
+ * leave these blocks visible to the user.)
+ *
+ * Called with inode->sem down.
+ */
+int ext4_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error, rc = 0;
+	const unsigned int ia_valid = attr->ia_valid;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		return error;
+
+	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+		(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+		handle_t *handle;
+
+		/* (user+group)*(old+new) structure, inode write (sb,
+		 * inode block, ? - but truncate inode update has it) */
+		handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
+					EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+		if (IS_ERR(handle)) {
+			error = PTR_ERR(handle);
+			goto err_out;
+		}
+		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+		if (error) {
+			ext4_journal_stop(handle);
+			return error;
+		}
+		/* Update corresponding info in inode so that everything is in
+		 * one transaction */
+		if (attr->ia_valid & ATTR_UID)
+			inode->i_uid = attr->ia_uid;
+		if (attr->ia_valid & ATTR_GID)
+			inode->i_gid = attr->ia_gid;
+		error = ext4_mark_inode_dirty(handle, inode);
+		ext4_journal_stop(handle);
+	}
+
+	if (S_ISREG(inode->i_mode) &&
+	    attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+		handle_t *handle;
+
+		handle = ext4_journal_start(inode, 3);
+		if (IS_ERR(handle)) {
+			error = PTR_ERR(handle);
+			goto err_out;
+		}
+
+		error = ext4_orphan_add(handle, inode);
+		EXT4_I(inode)->i_disksize = attr->ia_size;
+		rc = ext4_mark_inode_dirty(handle, inode);
+		if (!error)
+			error = rc;
+		ext4_journal_stop(handle);
+	}
+
+	rc = inode_setattr(inode, attr);
+
+	/* If inode_setattr's call to ext4_truncate failed to get a
+	 * transaction handle at all, we need to clean up the in-core
+	 * orphan list manually. */
+	if (inode->i_nlink)
+		ext4_orphan_del(NULL, inode);
+
+	if (!rc && (ia_valid & ATTR_MODE))
+		rc = ext4_acl_chmod(inode);
+
+err_out:
+	ext4_std_error(inode->i_sb, error);
+	if (!error)
+		error = rc;
+	return error;
+}
+
+
+/*
+ * How many blocks doth make a writepage()?
+ *
+ * With N blocks per page, it may be:
+ * N data blocks
+ * 2 indirect block
+ * 2 dindirect
+ * 1 tindirect
+ * N+5 bitmap blocks (from the above)
+ * N+5 group descriptor summary blocks
+ * 1 inode block
+ * 1 superblock.
+ * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
+ *
+ * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
+ *
+ * With ordered or writeback data it's the same, less the N data blocks.
+ *
+ * If the inode's direct blocks can hold an integral number of pages then a
+ * page cannot straddle two indirect blocks, and we can only touch one indirect
+ * and dindirect block, and the "5" above becomes "3".
+ *
+ * This still overestimates under most circumstances.  If we were to pass the
+ * start and end offsets in here as well we could do block_to_path() on each
+ * block and work out the exact number of indirects which are touched.  Pah.
+ */
+
+int ext4_writepage_trans_blocks(struct inode *inode)
+{
+	int bpp = ext4_journal_blocks_per_page(inode);
+	int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
+	int ret;
+
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+		return ext4_ext_writepage_trans_blocks(inode, bpp);
+
+	if (ext4_should_journal_data(inode))
+		ret = 3 * (bpp + indirects) + 2;
+	else
+		ret = 2 * (bpp + indirects) + 2;
+
+#ifdef CONFIG_QUOTA
+	/* We know that structure was already allocated during DQUOT_INIT so
+	 * we will be updating only the data blocks + inodes */
+	ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+#endif
+
+	return ret;
+}
+
+/*
+ * The caller must have previously called ext4_reserve_inode_write().
+ * Give this, we know that the caller already has write access to iloc->bh.
+ */
+int ext4_mark_iloc_dirty(handle_t *handle,
+		struct inode *inode, struct ext4_iloc *iloc)
+{
+	int err = 0;
+
+	/* the do_update_inode consumes one bh->b_count */
+	get_bh(iloc->bh);
+
+	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
+	err = ext4_do_update_inode(handle, inode, iloc);
+	put_bh(iloc->bh);
+	return err;
+}
+
+/*
+ * On success, We end up with an outstanding reference count against
+ * iloc->bh.  This _must_ be cleaned up later.
+ */
+
+int
+ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
+			 struct ext4_iloc *iloc)
+{
+	int err = 0;
+	if (handle) {
+		err = ext4_get_inode_loc(inode, iloc);
+		if (!err) {
+			BUFFER_TRACE(iloc->bh, "get_write_access");
+			err = ext4_journal_get_write_access(handle, iloc->bh);
+			if (err) {
+				brelse(iloc->bh);
+				iloc->bh = NULL;
+			}
+		}
+	}
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * What we do here is to mark the in-core inode as clean with respect to inode
+ * dirtiness (it may still be data-dirty).
+ * This means that the in-core inode may be reaped by prune_icache
+ * without having to perform any I/O.  This is a very good thing,
+ * because *any* task may call prune_icache - even ones which
+ * have a transaction open against a different journal.
+ *
+ * Is this cheating?  Not really.  Sure, we haven't written the
+ * inode out, but prune_icache isn't a user-visible syncing function.
+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
+ * we start and wait on commits.
+ *
+ * Is this efficient/effective?  Well, we're being nice to the system
+ * by cleaning up our inodes proactively so they can be reaped
+ * without I/O.  But we are potentially leaving up to five seconds'
+ * worth of inodes floating about which prune_icache wants us to
+ * write out.  One way to fix that would be to get prune_icache()
+ * to do a write_super() to free up some memory.  It has the desired
+ * effect.
+ */
+int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
+{
+	struct ext4_iloc iloc;
+	int err;
+
+	might_sleep();
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (!err)
+		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	return err;
+}
+
+/*
+ * ext4_dirty_inode() is called from __mark_inode_dirty()
+ *
+ * We're really interested in the case where a file is being extended.
+ * i_size has been changed by generic_commit_write() and we thus need
+ * to include the updated inode in the current transaction.
+ *
+ * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * are allocated to the file.
+ *
+ * If the inode is marked synchronous, we don't honour that here - doing
+ * so would cause a commit on atime updates, which we don't bother doing.
+ * We handle synchronous inodes at the highest possible level.
+ */
+void ext4_dirty_inode(struct inode *inode)
+{
+	handle_t *current_handle = ext4_journal_current_handle();
+	handle_t *handle;
+
+	handle = ext4_journal_start(inode, 2);
+	if (IS_ERR(handle))
+		goto out;
+	if (current_handle &&
+		current_handle->h_transaction != handle->h_transaction) {
+		/* This task has a transaction open against a different fs */
+		printk(KERN_EMERG "%s: transactions do not match!\n",
+		       __FUNCTION__);
+	} else {
+		jbd_debug(5, "marking dirty.  outer handle=%p\n",
+				current_handle);
+		ext4_mark_inode_dirty(handle, inode);
+	}
+	ext4_journal_stop(handle);
+out:
+	return;
+}
+
+#if 0
+/*
+ * Bind an inode's backing buffer_head into this transaction, to prevent
+ * it from being flushed to disk early.  Unlike
+ * ext4_reserve_inode_write, this leaves behind no bh reference and
+ * returns no iloc structure, so the caller needs to repeat the iloc
+ * lookup to mark the inode dirty later.
+ */
+static int ext4_pin_inode(handle_t *handle, struct inode *inode)
+{
+	struct ext4_iloc iloc;
+
+	int err = 0;
+	if (handle) {
+		err = ext4_get_inode_loc(inode, &iloc);
+		if (!err) {
+			BUFFER_TRACE(iloc.bh, "get_write_access");
+			err = jbd2_journal_get_write_access(handle, iloc.bh);
+			if (!err)
+				err = ext4_journal_dirty_metadata(handle,
+								  iloc.bh);
+			brelse(iloc.bh);
+		}
+	}
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+#endif
+
+int ext4_change_inode_journal_flag(struct inode *inode, int val)
+{
+	journal_t *journal;
+	handle_t *handle;
+	int err;
+
+	/*
+	 * We have to be very careful here: changing a data block's
+	 * journaling status dynamically is dangerous.  If we write a
+	 * data block to the journal, change the status and then delete
+	 * that block, we risk forgetting to revoke the old log record
+	 * from the journal and so a subsequent replay can corrupt data.
+	 * So, first we make sure that the journal is empty and that
+	 * nobody is changing anything.
+	 */
+
+	journal = EXT4_JOURNAL(inode);
+	if (is_journal_aborted(journal) || IS_RDONLY(inode))
+		return -EROFS;
+
+	jbd2_journal_lock_updates(journal);
+	jbd2_journal_flush(journal);
+
+	/*
+	 * OK, there are no updates running now, and all cached data is
+	 * synced to disk.  We are now in a completely consistent state
+	 * which doesn't have anything in the journal, and we know that
+	 * no filesystem updates are running, so it is safe to modify
+	 * the inode's in-core data-journaling state flag now.
+	 */
+
+	if (val)
+		EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+	else
+		EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+	ext4_set_aops(inode);
+
+	jbd2_journal_unlock_updates(journal);
+
+	/* Finally we can mark the inode as dirty. */
+
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	err = ext4_mark_inode_dirty(handle, inode);
+	handle->h_sync = 1;
+	ext4_journal_stop(handle);
+	ext4_std_error(inode->i_sb, err);
+
+	return err;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
new file mode 100644
index 000000000000..22a737c306c7
--- /dev/null
+++ b/fs/ext4/ioctl.c
@@ -0,0 +1,306 @@
+/*
+ * linux/fs/ext4/ioctl.c
+ *
+ * Copyright (C) 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/capability.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/time.h>
+#include <linux/compat.h>
+#include <linux/smp_lock.h>
+#include <asm/uaccess.h>
+
+int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+		unsigned long arg)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int flags;
+	unsigned short rsv_window_size;
+
+	ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+	switch (cmd) {
+	case EXT4_IOC_GETFLAGS:
+		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
+		return put_user(flags, (int __user *) arg);
+	case EXT4_IOC_SETFLAGS: {
+		handle_t *handle = NULL;
+		int err;
+		struct ext4_iloc iloc;
+		unsigned int oldflags;
+		unsigned int jflag;
+
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EACCES;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
+		if (!S_ISDIR(inode->i_mode))
+			flags &= ~EXT4_DIRSYNC_FL;
+
+		mutex_lock(&inode->i_mutex);
+		oldflags = ei->i_flags;
+
+		/* The JOURNAL_DATA flag is modifiable only by root */
+		jflag = flags & EXT4_JOURNAL_DATA_FL;
+
+		/*
+		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+		 * the relevant capability.
+		 *
+		 * This test looks nicer. Thanks to Pauline Middelink
+		 */
+		if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
+			if (!capable(CAP_LINUX_IMMUTABLE)) {
+				mutex_unlock(&inode->i_mutex);
+				return -EPERM;
+			}
+		}
+
+		/*
+		 * The JOURNAL_DATA flag can only be changed by
+		 * the relevant capability.
+		 */
+		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+			if (!capable(CAP_SYS_RESOURCE)) {
+				mutex_unlock(&inode->i_mutex);
+				return -EPERM;
+			}
+		}
+
+
+		handle = ext4_journal_start(inode, 1);
+		if (IS_ERR(handle)) {
+			mutex_unlock(&inode->i_mutex);
+			return PTR_ERR(handle);
+		}
+		if (IS_SYNC(inode))
+			handle->h_sync = 1;
+		err = ext4_reserve_inode_write(handle, inode, &iloc);
+		if (err)
+			goto flags_err;
+
+		flags = flags & EXT4_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
+		ei->i_flags = flags;
+
+		ext4_set_inode_flags(inode);
+		inode->i_ctime = CURRENT_TIME_SEC;
+
+		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+		ext4_journal_stop(handle);
+		if (err) {
+			mutex_unlock(&inode->i_mutex);
+			return err;
+		}
+
+		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
+			err = ext4_change_inode_journal_flag(inode, jflag);
+		mutex_unlock(&inode->i_mutex);
+		return err;
+	}
+	case EXT4_IOC_GETVERSION:
+	case EXT4_IOC_GETVERSION_OLD:
+		return put_user(inode->i_generation, (int __user *) arg);
+	case EXT4_IOC_SETVERSION:
+	case EXT4_IOC_SETVERSION_OLD: {
+		handle_t *handle;
+		struct ext4_iloc iloc;
+		__u32 generation;
+		int err;
+
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EPERM;
+		if (IS_RDONLY(inode))
+			return -EROFS;
+		if (get_user(generation, (int __user *) arg))
+			return -EFAULT;
+
+		handle = ext4_journal_start(inode, 1);
+		if (IS_ERR(handle))
+			return PTR_ERR(handle);
+		err = ext4_reserve_inode_write(handle, inode, &iloc);
+		if (err == 0) {
+			inode->i_ctime = CURRENT_TIME_SEC;
+			inode->i_generation = generation;
+			err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+		}
+		ext4_journal_stop(handle);
+		return err;
+	}
+#ifdef CONFIG_JBD_DEBUG
+	case EXT4_IOC_WAIT_FOR_READONLY:
+		/*
+		 * This is racy - by the time we're woken up and running,
+		 * the superblock could be released.  And the module could
+		 * have been unloaded.  So sue me.
+		 *
+		 * Returns 1 if it slept, else zero.
+		 */
+		{
+			struct super_block *sb = inode->i_sb;
+			DECLARE_WAITQUEUE(wait, current);
+			int ret = 0;
+
+			set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
+			if (timer_pending(&EXT4_SB(sb)->turn_ro_timer)) {
+				schedule();
+				ret = 1;
+			}
+			remove_wait_queue(&EXT4_SB(sb)->ro_wait_queue, &wait);
+			return ret;
+		}
+#endif
+	case EXT4_IOC_GETRSVSZ:
+		if (test_opt(inode->i_sb, RESERVATION)
+			&& S_ISREG(inode->i_mode)
+			&& ei->i_block_alloc_info) {
+			rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
+			return put_user(rsv_window_size, (int __user *)arg);
+		}
+		return -ENOTTY;
+	case EXT4_IOC_SETRSVSZ: {
+
+		if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
+			return -ENOTTY;
+
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EACCES;
+
+		if (get_user(rsv_window_size, (int __user *)arg))
+			return -EFAULT;
+
+		if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
+			rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
+
+		/*
+		 * need to allocate reservation structure for this inode
+		 * before set the window size
+		 */
+		mutex_lock(&ei->truncate_mutex);
+		if (!ei->i_block_alloc_info)
+			ext4_init_block_alloc_info(inode);
+
+		if (ei->i_block_alloc_info){
+			struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
+			rsv->rsv_goal_size = rsv_window_size;
+		}
+		mutex_unlock(&ei->truncate_mutex);
+		return 0;
+	}
+	case EXT4_IOC_GROUP_EXTEND: {
+		ext4_fsblk_t n_blocks_count;
+		struct super_block *sb = inode->i_sb;
+		int err;
+
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if (get_user(n_blocks_count, (__u32 __user *)arg))
+			return -EFAULT;
+
+		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
+		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+		return err;
+	}
+	case EXT4_IOC_GROUP_ADD: {
+		struct ext4_new_group_data input;
+		struct super_block *sb = inode->i_sb;
+		int err;
+
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
+				sizeof(input)))
+			return -EFAULT;
+
+		err = ext4_group_add(sb, &input);
+		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+		jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+
+		return err;
+	}
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int ret;
+
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case EXT4_IOC32_GETFLAGS:
+		cmd = EXT4_IOC_GETFLAGS;
+		break;
+	case EXT4_IOC32_SETFLAGS:
+		cmd = EXT4_IOC_SETFLAGS;
+		break;
+	case EXT4_IOC32_GETVERSION:
+		cmd = EXT4_IOC_GETVERSION;
+		break;
+	case EXT4_IOC32_SETVERSION:
+		cmd = EXT4_IOC_SETVERSION;
+		break;
+	case EXT4_IOC32_GROUP_EXTEND:
+		cmd = EXT4_IOC_GROUP_EXTEND;
+		break;
+	case EXT4_IOC32_GETVERSION_OLD:
+		cmd = EXT4_IOC_GETVERSION_OLD;
+		break;
+	case EXT4_IOC32_SETVERSION_OLD:
+		cmd = EXT4_IOC_SETVERSION_OLD;
+		break;
+#ifdef CONFIG_JBD_DEBUG
+	case EXT4_IOC32_WAIT_FOR_READONLY:
+		cmd = EXT4_IOC_WAIT_FOR_READONLY;
+		break;
+#endif
+	case EXT4_IOC32_GETRSVSZ:
+		cmd = EXT4_IOC_GETRSVSZ;
+		break;
+	case EXT4_IOC32_SETRSVSZ:
+		cmd = EXT4_IOC_SETRSVSZ;
+		break;
+	case EXT4_IOC_GROUP_ADD:
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	lock_kernel();
+	ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+	unlock_kernel();
+	return ret;
+}
+#endif
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
new file mode 100644
index 000000000000..8b1bd03d20f5
--- /dev/null
+++ b/fs/ext4/namei.c
@@ -0,0 +1,2395 @@
+/*
+ *  linux/fs/ext4/namei.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *  Directory entry file type support and forward compatibility hooks
+ *	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
+ *  Hash Tree Directory indexing (c)
+ *	Daniel Phillips, 2001
+ *  Hash Tree Directory indexing porting
+ *	Christopher Li, 2002
+ *  Hash Tree Directory indexing cleanup
+ *	Theodore Ts'o, 2002
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/jbd2.h>
+#include <linux/time.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include <linux/smp_lock.h>
+
+#include "namei.h"
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * define how far ahead to read directories while searching them.
+ */
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+
+static struct buffer_head *ext4_append(handle_t *handle,
+					struct inode *inode,
+					u32 *block, int *err)
+{
+	struct buffer_head *bh;
+
+	*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+
+	if ((bh = ext4_bread(handle, inode, *block, 1, err))) {
+		inode->i_size += inode->i_sb->s_blocksize;
+		EXT4_I(inode)->i_disksize = inode->i_size;
+		ext4_journal_get_write_access(handle,bh);
+	}
+	return bh;
+}
+
+#ifndef assert
+#define assert(test) J_ASSERT(test)
+#endif
+
+#ifndef swap
+#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
+#endif
+
+#ifdef DX_DEBUG
+#define dxtrace(command) command
+#else
+#define dxtrace(command)
+#endif
+
+struct fake_dirent
+{
+	__le32 inode;
+	__le16 rec_len;
+	u8 name_len;
+	u8 file_type;
+};
+
+struct dx_countlimit
+{
+	__le16 limit;
+	__le16 count;
+};
+
+struct dx_entry
+{
+	__le32 hash;
+	__le32 block;
+};
+
+/*
+ * dx_root_info is laid out so that if it should somehow get overlaid by a
+ * dirent the two low bits of the hash version will be zero.  Therefore, the
+ * hash version mod 4 should never be 0.  Sincerely, the paranoia department.
+ */
+
+struct dx_root
+{
+	struct fake_dirent dot;
+	char dot_name[4];
+	struct fake_dirent dotdot;
+	char dotdot_name[4];
+	struct dx_root_info
+	{
+		__le32 reserved_zero;
+		u8 hash_version;
+		u8 info_length; /* 8 */
+		u8 indirect_levels;
+		u8 unused_flags;
+	}
+	info;
+	struct dx_entry	entries[0];
+};
+
+struct dx_node
+{
+	struct fake_dirent fake;
+	struct dx_entry	entries[0];
+};
+
+
+struct dx_frame
+{
+	struct buffer_head *bh;
+	struct dx_entry *entries;
+	struct dx_entry *at;
+};
+
+struct dx_map_entry
+{
+	u32 hash;
+	u32 offs;
+};
+
+#ifdef CONFIG_EXT4_INDEX
+static inline unsigned dx_get_block (struct dx_entry *entry);
+static void dx_set_block (struct dx_entry *entry, unsigned value);
+static inline unsigned dx_get_hash (struct dx_entry *entry);
+static void dx_set_hash (struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count (struct dx_entry *entries);
+static unsigned dx_get_limit (struct dx_entry *entries);
+static void dx_set_count (struct dx_entry *entries, unsigned value);
+static void dx_set_limit (struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit (struct inode *dir);
+static struct dx_frame *dx_probe(struct dentry *dentry,
+				 struct inode *dir,
+				 struct dx_hash_info *hinfo,
+				 struct dx_frame *frame,
+				 int *err);
+static void dx_release (struct dx_frame *frames);
+static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
+			struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_sort_map(struct dx_map_entry *map, unsigned count);
+static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
+		struct dx_map_entry *offsets, int count);
+static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
+static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+				 struct dx_frame *frame,
+				 struct dx_frame *frames,
+				 __u32 *start_hash);
+static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
+		       struct ext4_dir_entry_2 **res_dir, int *err);
+static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode);
+
+/*
+ * Future: use high four bits of block for coalesce-on-delete flags
+ * Mask them off for now.
+ */
+
+static inline unsigned dx_get_block (struct dx_entry *entry)
+{
+	return le32_to_cpu(entry->block) & 0x00ffffff;
+}
+
+static inline void dx_set_block (struct dx_entry *entry, unsigned value)
+{
+	entry->block = cpu_to_le32(value);
+}
+
+static inline unsigned dx_get_hash (struct dx_entry *entry)
+{
+	return le32_to_cpu(entry->hash);
+}
+
+static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
+{
+	entry->hash = cpu_to_le32(value);
+}
+
+static inline unsigned dx_get_count (struct dx_entry *entries)
+{
+	return le16_to_cpu(((struct dx_countlimit *) entries)->count);
+}
+
+static inline unsigned dx_get_limit (struct dx_entry *entries)
+{
+	return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
+}
+
+static inline void dx_set_count (struct dx_entry *entries, unsigned value)
+{
+	((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
+}
+
+static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
+{
+	((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+}
+
+static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
+{
+	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
+		EXT4_DIR_REC_LEN(2) - infosize;
+	return 0? 20: entry_space / sizeof(struct dx_entry);
+}
+
+static inline unsigned dx_node_limit (struct inode *dir)
+{
+	unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+	return 0? 22: entry_space / sizeof(struct dx_entry);
+}
+
+/*
+ * Debug
+ */
+#ifdef DX_DEBUG
+static void dx_show_index (char * label, struct dx_entry *entries)
+{
+	int i, n = dx_get_count (entries);
+        printk("%s index ", label);
+	for (i = 0; i < n; i++) {
+		printk("%x->%u ", i? dx_get_hash(entries + i) :
+				0, dx_get_block(entries + i));
+	}
+	printk("\n");
+}
+
+struct stats
+{
+	unsigned names;
+	unsigned space;
+	unsigned bcount;
+};
+
+static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
+				 int size, int show_names)
+{
+	unsigned names = 0, space = 0;
+	char *base = (char *) de;
+	struct dx_hash_info h = *hinfo;
+
+	printk("names: ");
+	while ((char *) de < base + size)
+	{
+		if (de->inode)
+		{
+			if (show_names)
+			{
+				int len = de->name_len;
+				char *name = de->name;
+				while (len--) printk("%c", *name++);
+				ext4fs_dirhash(de->name, de->name_len, &h);
+				printk(":%x.%u ", h.hash,
+				       ((char *) de - base));
+			}
+			space += EXT4_DIR_REC_LEN(de->name_len);
+			names++;
+		}
+		de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+	}
+	printk("(%i)\n", names);
+	return (struct stats) { names, space, 1 };
+}
+
+struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+			     struct dx_entry *entries, int levels)
+{
+	unsigned blocksize = dir->i_sb->s_blocksize;
+	unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+	unsigned bcount = 0;
+	struct buffer_head *bh;
+	int err;
+	printk("%i indexed blocks...\n", count);
+	for (i = 0; i < count; i++, entries++)
+	{
+		u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
+		u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
+		struct stats stats;
+		printk("%s%3u:%03u hash %8x/%8x ",levels?"":"   ", i, block, hash, range);
+		if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
+		stats = levels?
+		   dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
+		   dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
+		names += stats.names;
+		space += stats.space;
+		bcount += stats.bcount;
+		brelse (bh);
+	}
+	if (bcount)
+		printk("%snames %u, fullness %u (%u%%)\n", levels?"":"   ",
+			names, space/bcount,(space/bcount)*100/blocksize);
+	return (struct stats) { names, space, bcount};
+}
+#endif /* DX_DEBUG */
+
+/*
+ * Probe for a directory leaf block to search.
+ *
+ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
+ * error in the directory index, and the caller should fall back to
+ * searching the directory normally.  The callers of dx_probe **MUST**
+ * check for this error code, and make sure it never gets reflected
+ * back to userspace.
+ */
+static struct dx_frame *
+dx_probe(struct dentry *dentry, struct inode *dir,
+	 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
+{
+	unsigned count, indirect;
+	struct dx_entry *at, *entries, *p, *q, *m;
+	struct dx_root *root;
+	struct buffer_head *bh;
+	struct dx_frame *frame = frame_in;
+	u32 hash;
+
+	frame->bh = NULL;
+	if (dentry)
+		dir = dentry->d_parent->d_inode;
+	if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
+		goto fail;
+	root = (struct dx_root *) bh->b_data;
+	if (root->info.hash_version != DX_HASH_TEA &&
+	    root->info.hash_version != DX_HASH_HALF_MD4 &&
+	    root->info.hash_version != DX_HASH_LEGACY) {
+		ext4_warning(dir->i_sb, __FUNCTION__,
+			     "Unrecognised inode hash code %d",
+			     root->info.hash_version);
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+	hinfo->hash_version = root->info.hash_version;
+	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+	if (dentry)
+		ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+	hash = hinfo->hash;
+
+	if (root->info.unused_flags & 1) {
+		ext4_warning(dir->i_sb, __FUNCTION__,
+			     "Unimplemented inode hash flags: %#06x",
+			     root->info.unused_flags);
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
+	if ((indirect = root->info.indirect_levels) > 1) {
+		ext4_warning(dir->i_sb, __FUNCTION__,
+			     "Unimplemented inode hash depth: %#06x",
+			     root->info.indirect_levels);
+		brelse(bh);
+		*err = ERR_BAD_DX_DIR;
+		goto fail;
+	}
+
+	entries = (struct dx_entry *) (((char *)&root->info) +
+				       root->info.info_length);
+	assert(dx_get_limit(entries) == dx_root_limit(dir,
+						      root->info.info_length));
+	dxtrace (printk("Look up %x", hash));
+	while (1)
+	{
+		count = dx_get_count(entries);
+		assert (count && count <= dx_get_limit(entries));
+		p = entries + 1;
+		q = entries + count - 1;
+		while (p <= q)
+		{
+			m = p + (q - p)/2;
+			dxtrace(printk("."));
+			if (dx_get_hash(m) > hash)
+				q = m - 1;
+			else
+				p = m + 1;
+		}
+
+		if (0) // linear search cross check
+		{
+			unsigned n = count - 1;
+			at = entries;
+			while (n--)
+			{
+				dxtrace(printk(","));
+				if (dx_get_hash(++at) > hash)
+				{
+					at--;
+					break;
+				}
+			}
+			assert (at == p - 1);
+		}
+
+		at = p - 1;
+		dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
+		frame->bh = bh;
+		frame->entries = entries;
+		frame->at = at;
+		if (!indirect--) return frame;
+		if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
+			goto fail2;
+		at = entries = ((struct dx_node *) bh->b_data)->entries;
+		assert (dx_get_limit(entries) == dx_node_limit (dir));
+		frame++;
+	}
+fail2:
+	while (frame >= frame_in) {
+		brelse(frame->bh);
+		frame--;
+	}
+fail:
+	return NULL;
+}
+
+static void dx_release (struct dx_frame *frames)
+{
+	if (frames[0].bh == NULL)
+		return;
+
+	if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
+		brelse(frames[1].bh);
+	brelse(frames[0].bh);
+}
+
+/*
+ * This function increments the frame pointer to search the next leaf
+ * block, and reads in the necessary intervening nodes if the search
+ * should be necessary.  Whether or not the search is necessary is
+ * controlled by the hash parameter.  If the hash value is even, then
+ * the search is only continued if the next block starts with that
+ * hash value.  This is used if we are searching for a specific file.
+ *
+ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
+ *
+ * This function returns 1 if the caller should continue to search,
+ * or 0 if it should not.  If there is an error reading one of the
+ * index blocks, it will a negative error code.
+ *
+ * If start_hash is non-null, it will be filled in with the starting
+ * hash of the next page.
+ */
+static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+				 struct dx_frame *frame,
+				 struct dx_frame *frames,
+				 __u32 *start_hash)
+{
+	struct dx_frame *p;
+	struct buffer_head *bh;
+	int err, num_frames = 0;
+	__u32 bhash;
+
+	p = frame;
+	/*
+	 * Find the next leaf page by incrementing the frame pointer.
+	 * If we run out of entries in the interior node, loop around and
+	 * increment pointer in the parent node.  When we break out of
+	 * this loop, num_frames indicates the number of interior
+	 * nodes need to be read.
+	 */
+	while (1) {
+		if (++(p->at) < p->entries + dx_get_count(p->entries))
+			break;
+		if (p == frames)
+			return 0;
+		num_frames++;
+		p--;
+	}
+
+	/*
+	 * If the hash is 1, then continue only if the next page has a
+	 * continuation hash of any value.  This is used for readdir
+	 * handling.  Otherwise, check to see if the hash matches the
+	 * desired contiuation hash.  If it doesn't, return since
+	 * there's no point to read in the successive index pages.
+	 */
+	bhash = dx_get_hash(p->at);
+	if (start_hash)
+		*start_hash = bhash;
+	if ((hash & 1) == 0) {
+		if ((bhash & ~1) != hash)
+			return 0;
+	}
+	/*
+	 * If the hash is HASH_NB_ALWAYS, we always go to the next
+	 * block so no check is necessary
+	 */
+	while (num_frames--) {
+		if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
+				      0, &err)))
+			return err; /* Failure */
+		p++;
+		brelse (p->bh);
+		p->bh = bh;
+		p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
+	}
+	return 1;
+}
+
+
+/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+	return (struct ext4_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
+}
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory block.  It returns the number directory entries loaded
+ * into the tree.  If there is an error it is returned in err.
+ */
+static int htree_dirblock_to_tree(struct file *dir_file,
+				  struct inode *dir, int block,
+				  struct dx_hash_info *hinfo,
+				  __u32 start_hash, __u32 start_minor_hash)
+{
+	struct buffer_head *bh;
+	struct ext4_dir_entry_2 *de, *top;
+	int err, count = 0;
+
+	dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
+	if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
+		return err;
+
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	top = (struct ext4_dir_entry_2 *) ((char *) de +
+					   dir->i_sb->s_blocksize -
+					   EXT4_DIR_REC_LEN(0));
+	for (; de < top; de = ext4_next_entry(de)) {
+		ext4fs_dirhash(de->name, de->name_len, hinfo);
+		if ((hinfo->hash < start_hash) ||
+		    ((hinfo->hash == start_hash) &&
+		     (hinfo->minor_hash < start_minor_hash)))
+			continue;
+		if (de->inode == 0)
+			continue;
+		if ((err = ext4_htree_store_dirent(dir_file,
+				   hinfo->hash, hinfo->minor_hash, de)) != 0) {
+			brelse(bh);
+			return err;
+		}
+		count++;
+	}
+	brelse(bh);
+	return count;
+}
+
+
+/*
+ * This function fills a red-black tree with information from a
+ * directory.  We start scanning the directory in hash order, starting
+ * at start_hash and start_minor_hash.
+ *
+ * This function returns the number of entries inserted into the tree,
+ * or a negative error code.
+ */
+int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+			 __u32 start_minor_hash, __u32 *next_hash)
+{
+	struct dx_hash_info hinfo;
+	struct ext4_dir_entry_2 *de;
+	struct dx_frame frames[2], *frame;
+	struct inode *dir;
+	int block, err;
+	int count = 0;
+	int ret;
+	__u32 hashval;
+
+	dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
+		       start_minor_hash));
+	dir = dir_file->f_dentry->d_inode;
+	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
+					       start_hash, start_minor_hash);
+		*next_hash = ~0;
+		return count;
+	}
+	hinfo.hash = start_hash;
+	hinfo.minor_hash = 0;
+	frame = dx_probe(NULL, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
+	if (!frame)
+		return err;
+
+	/* Add '.' and '..' from the htree header */
+	if (!start_hash && !start_minor_hash) {
+		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+		if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
+			goto errout;
+		count++;
+	}
+	if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
+		de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+		de = ext4_next_entry(de);
+		if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
+			goto errout;
+		count++;
+	}
+
+	while (1) {
+		block = dx_get_block(frame->at);
+		ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
+					     start_hash, start_minor_hash);
+		if (ret < 0) {
+			err = ret;
+			goto errout;
+		}
+		count += ret;
+		hashval = ~0;
+		ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
+					    frame, frames, &hashval);
+		*next_hash = hashval;
+		if (ret < 0) {
+			err = ret;
+			goto errout;
+		}
+		/*
+		 * Stop if:  (a) there are no more entries, or
+		 * (b) we have inserted at least one entry and the
+		 * next hash value is not a continuation
+		 */
+		if ((ret == 0) ||
+		    (count && ((hashval & 1) == 0)))
+			break;
+	}
+	dx_release(frames);
+	dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
+		       count, *next_hash));
+	return count;
+errout:
+	dx_release(frames);
+	return (err);
+}
+
+
+/*
+ * Directory block splitting, compacting
+ */
+
+static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
+			struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+{
+	int count = 0;
+	char *base = (char *) de;
+	struct dx_hash_info h = *hinfo;
+
+	while ((char *) de < base + size)
+	{
+		if (de->name_len && de->inode) {
+			ext4fs_dirhash(de->name, de->name_len, &h);
+			map_tail--;
+			map_tail->hash = h.hash;
+			map_tail->offs = (u32) ((char *) de - base);
+			count++;
+			cond_resched();
+		}
+		/* XXX: do we need to check rec_len == 0 case? -Chris */
+		de = (struct ext4_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
+	}
+	return count;
+}
+
+static void dx_sort_map (struct dx_map_entry *map, unsigned count)
+{
+	struct dx_map_entry *p, *q, *top = map + count - 1;
+	int more;
+	/* Combsort until bubble sort doesn't suck */
+	while (count > 2) {
+		count = count*10/13;
+		if (count - 9 < 2) /* 9, 10 -> 11 */
+			count = 11;
+		for (p = top, q = p - count; q >= map; p--, q--)
+			if (p->hash < q->hash)
+				swap(*p, *q);
+	}
+	/* Garden variety bubble sort */
+	do {
+		more = 0;
+		q = top;
+		while (q-- > map) {
+			if (q[1].hash >= q[0].hash)
+				continue;
+			swap(*(q+1), *q);
+			more = 1;
+		}
+	} while(more);
+}
+
+static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
+{
+	struct dx_entry *entries = frame->entries;
+	struct dx_entry *old = frame->at, *new = old + 1;
+	int count = dx_get_count(entries);
+
+	assert(count < dx_get_limit(entries));
+	assert(old < entries + count);
+	memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
+	dx_set_hash(new, hash);
+	dx_set_block(new, block);
+	dx_set_count(entries, count + 1);
+}
+#endif
+
+
+static void ext4_update_dx_flag(struct inode *inode)
+{
+	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+				     EXT4_FEATURE_COMPAT_DIR_INDEX))
+		EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+}
+
+/*
+ * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
+ *
+ * `len <= EXT4_NAME_LEN' is guaranteed by caller.
+ * `de != NULL' is guaranteed by caller.
+ */
+static inline int ext4_match (int len, const char * const name,
+			      struct ext4_dir_entry_2 * de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+static inline int search_dirblock(struct buffer_head * bh,
+				  struct inode *dir,
+				  struct dentry *dentry,
+				  unsigned long offset,
+				  struct ext4_dir_entry_2 ** res_dir)
+{
+	struct ext4_dir_entry_2 * de;
+	char * dlimit;
+	int de_len;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	dlimit = bh->b_data + dir->i_sb->s_blocksize;
+	while ((char *) de < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		if ((char *) de + namelen <= dlimit &&
+		    ext4_match (namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!ext4_check_dir_entry("ext4_find_entry",
+						  dir, de, bh, offset))
+				return -1;
+			*res_dir = de;
+			return 1;
+		}
+		/* prevent looping on a bad block */
+		de_len = le16_to_cpu(de->rec_len);
+		if (de_len <= 0)
+			return -1;
+		offset += de_len;
+		de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+	}
+	return 0;
+}
+
+
+/*
+ *	ext4_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the cache buffer in which the entry was found, and the entry
+ * itself (as a parameter - res_dir). It does NOT read the inode of the
+ * entry - you'll have to do that yourself if you want to.
+ *
+ * The returned buffer_head has ->b_count elevated.  The caller is expected
+ * to brelse() it when appropriate.
+ */
+static struct buffer_head * ext4_find_entry (struct dentry *dentry,
+					struct ext4_dir_entry_2 ** res_dir)
+{
+	struct super_block * sb;
+	struct buffer_head * bh_use[NAMEI_RA_SIZE];
+	struct buffer_head * bh, *ret = NULL;
+	unsigned long start, block, b;
+	int ra_max = 0;		/* Number of bh's in the readahead
+				   buffer, bh_use[] */
+	int ra_ptr = 0;		/* Current index into readahead
+				   buffer */
+	int num = 0;
+	int nblocks, i, err;
+	struct inode *dir = dentry->d_parent->d_inode;
+	int namelen;
+	const u8 *name;
+	unsigned blocksize;
+
+	*res_dir = NULL;
+	sb = dir->i_sb;
+	blocksize = sb->s_blocksize;
+	namelen = dentry->d_name.len;
+	name = dentry->d_name.name;
+	if (namelen > EXT4_NAME_LEN)
+		return NULL;
+#ifdef CONFIG_EXT4_INDEX
+	if (is_dx(dir)) {
+		bh = ext4_dx_find_entry(dentry, res_dir, &err);
+		/*
+		 * On success, or if the error was file not found,
+		 * return.  Otherwise, fall back to doing a search the
+		 * old fashioned way.
+		 */
+		if (bh || (err != ERR_BAD_DX_DIR))
+			return bh;
+		dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
+	}
+#endif
+	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+	start = EXT4_I(dir)->i_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+restart:
+	do {
+		/*
+		 * We deal with the read-ahead logic here.
+		 */
+		if (ra_ptr >= ra_max) {
+			/* Refill the readahead buffer */
+			ra_ptr = 0;
+			b = block;
+			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+				/*
+				 * Terminate if we reach the end of the
+				 * directory and must wrap, or if our
+				 * search has finished at this block.
+				 */
+				if (b >= nblocks || (num && block == start)) {
+					bh_use[ra_max] = NULL;
+					break;
+				}
+				num++;
+				bh = ext4_getblk(NULL, dir, b++, 0, &err);
+				bh_use[ra_max] = bh;
+				if (bh)
+					ll_rw_block(READ_META, 1, &bh);
+			}
+		}
+		if ((bh = bh_use[ra_ptr++]) == NULL)
+			goto next;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* read error, skip block & hope for the best */
+			ext4_error(sb, __FUNCTION__, "reading directory #%lu "
+				   "offset %lu", dir->i_ino, block);
+			brelse(bh);
+			goto next;
+		}
+		i = search_dirblock(bh, dir, dentry,
+			    block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
+		if (i == 1) {
+			EXT4_I(dir)->i_dir_start_lookup = block;
+			ret = bh;
+			goto cleanup_and_exit;
+		} else {
+			brelse(bh);
+			if (i < 0)
+				goto cleanup_and_exit;
+		}
+	next:
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+
+cleanup_and_exit:
+	/* Clean up the read-ahead blocks */
+	for (; ra_ptr < ra_max; ra_ptr++)
+		brelse (bh_use[ra_ptr]);
+	return ret;
+}
+
+#ifdef CONFIG_EXT4_INDEX
+static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
+		       struct ext4_dir_entry_2 **res_dir, int *err)
+{
+	struct super_block * sb;
+	struct dx_hash_info	hinfo;
+	u32 hash;
+	struct dx_frame frames[2], *frame;
+	struct ext4_dir_entry_2 *de, *top;
+	struct buffer_head *bh;
+	unsigned long block;
+	int retval;
+	int namelen = dentry->d_name.len;
+	const u8 *name = dentry->d_name.name;
+	struct inode *dir = dentry->d_parent->d_inode;
+
+	sb = dir->i_sb;
+	/* NFS may look up ".." - look at dx_root directory block */
+	if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
+		if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
+			return NULL;
+	} else {
+		frame = frames;
+		frame->bh = NULL;			/* for dx_release() */
+		frame->at = (struct dx_entry *)frames;	/* hack for zero entry*/
+		dx_set_block(frame->at, 0);		/* dx_root block is 0 */
+	}
+	hash = hinfo.hash;
+	do {
+		block = dx_get_block(frame->at);
+		if (!(bh = ext4_bread (NULL,dir, block, 0, err)))
+			goto errout;
+		de = (struct ext4_dir_entry_2 *) bh->b_data;
+		top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
+				       EXT4_DIR_REC_LEN(0));
+		for (; de < top; de = ext4_next_entry(de))
+		if (ext4_match (namelen, name, de)) {
+			if (!ext4_check_dir_entry("ext4_find_entry",
+						  dir, de, bh,
+				  (block<<EXT4_BLOCK_SIZE_BITS(sb))
+					  +((char *)de - bh->b_data))) {
+				brelse (bh);
+				goto errout;
+			}
+			*res_dir = de;
+			dx_release (frames);
+			return bh;
+		}
+		brelse (bh);
+		/* Check to see if we should continue to search */
+		retval = ext4_htree_next_block(dir, hash, frame,
+					       frames, NULL);
+		if (retval < 0) {
+			ext4_warning(sb, __FUNCTION__,
+			     "error reading index page in directory #%lu",
+			     dir->i_ino);
+			*err = retval;
+			goto errout;
+		}
+	} while (retval == 1);
+
+	*err = -ENOENT;
+errout:
+	dxtrace(printk("%s not found\n", name));
+	dx_release (frames);
+	return NULL;
+}
+#endif
+
+static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode * inode;
+	struct ext4_dir_entry_2 * de;
+	struct buffer_head * bh;
+
+	if (dentry->d_name.len > EXT4_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	bh = ext4_find_entry(dentry, &de);
+	inode = NULL;
+	if (bh) {
+		unsigned long ino = le32_to_cpu(de->inode);
+		brelse (bh);
+		if (!ext4_valid_inum(dir->i_sb, ino)) {
+			ext4_error(dir->i_sb, "ext4_lookup",
+				   "bad inode number: %lu", ino);
+			inode = NULL;
+		} else
+			inode = iget(dir->i_sb, ino);
+
+		if (!inode)
+			return ERR_PTR(-EACCES);
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+
+struct dentry *ext4_get_parent(struct dentry *child)
+{
+	unsigned long ino;
+	struct dentry *parent;
+	struct inode *inode;
+	struct dentry dotdot;
+	struct ext4_dir_entry_2 * de;
+	struct buffer_head *bh;
+
+	dotdot.d_name.name = "..";
+	dotdot.d_name.len = 2;
+	dotdot.d_parent = child; /* confusing, isn't it! */
+
+	bh = ext4_find_entry(&dotdot, &de);
+	inode = NULL;
+	if (!bh)
+		return ERR_PTR(-ENOENT);
+	ino = le32_to_cpu(de->inode);
+	brelse(bh);
+
+	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
+		ext4_error(child->d_inode->i_sb, "ext4_get_parent",
+			   "bad inode number: %lu", ino);
+		inode = NULL;
+	} else
+		inode = iget(child->d_inode->i_sb, ino);
+
+	if (!inode)
+		return ERR_PTR(-EACCES);
+
+	parent = d_alloc_anon(inode);
+	if (!parent) {
+		iput(inode);
+		parent = ERR_PTR(-ENOMEM);
+	}
+	return parent;
+}
+
+#define S_SHIFT 12
+static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXT4_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXT4_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXT4_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXT4_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXT4_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXT4_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXT4_FT_SYMLINK,
+};
+
+static inline void ext4_set_de_type(struct super_block *sb,
+				struct ext4_dir_entry_2 *de,
+				umode_t mode) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+		de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+}
+
+#ifdef CONFIG_EXT4_INDEX
+static struct ext4_dir_entry_2 *
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+{
+	unsigned rec_len = 0;
+
+	while (count--) {
+		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+		rec_len = EXT4_DIR_REC_LEN(de->name_len);
+		memcpy (to, de, rec_len);
+		((struct ext4_dir_entry_2 *) to)->rec_len =
+				cpu_to_le16(rec_len);
+		de->inode = 0;
+		map++;
+		to += rec_len;
+	}
+	return (struct ext4_dir_entry_2 *) (to - rec_len);
+}
+
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
+{
+	struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
+	unsigned rec_len = 0;
+
+	prev = to = de;
+	while ((char*)de < base + size) {
+		next = (struct ext4_dir_entry_2 *) ((char *) de +
+						    le16_to_cpu(de->rec_len));
+		if (de->inode && de->name_len) {
+			rec_len = EXT4_DIR_REC_LEN(de->name_len);
+			if (de > to)
+				memmove(to, de, rec_len);
+			to->rec_len = cpu_to_le16(rec_len);
+			prev = to;
+			to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
+		}
+		de = next;
+	}
+	return prev;
+}
+
+static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+			struct buffer_head **bh,struct dx_frame *frame,
+			struct dx_hash_info *hinfo, int *error)
+{
+	unsigned blocksize = dir->i_sb->s_blocksize;
+	unsigned count, continued;
+	struct buffer_head *bh2;
+	u32 newblock;
+	u32 hash2;
+	struct dx_map_entry *map;
+	char *data1 = (*bh)->b_data, *data2;
+	unsigned split;
+	struct ext4_dir_entry_2 *de = NULL, *de2;
+	int	err;
+
+	bh2 = ext4_append (handle, dir, &newblock, error);
+	if (!(bh2)) {
+		brelse(*bh);
+		*bh = NULL;
+		goto errout;
+	}
+
+	BUFFER_TRACE(*bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, *bh);
+	if (err) {
+	journal_error:
+		brelse(*bh);
+		brelse(bh2);
+		*bh = NULL;
+		ext4_std_error(dir->i_sb, err);
+		goto errout;
+	}
+	BUFFER_TRACE(frame->bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, frame->bh);
+	if (err)
+		goto journal_error;
+
+	data2 = bh2->b_data;
+
+	/* create map in the end of data2 block */
+	map = (struct dx_map_entry *) (data2 + blocksize);
+	count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
+			     blocksize, hinfo, map);
+	map -= count;
+	split = count/2; // need to adjust to actual middle
+	dx_sort_map (map, count);
+	hash2 = map[split].hash;
+	continued = hash2 == map[split - 1].hash;
+	dxtrace(printk("Split block %i at %x, %i/%i\n",
+		dx_get_block(frame->at), hash2, split, count-split));
+
+	/* Fancy dance to stay within two buffers */
+	de2 = dx_move_dirents(data1, data2, map + split, count - split);
+	de = dx_pack_dirents(data1,blocksize);
+	de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+	de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
+	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
+	dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
+
+	/* Which block gets the new entry? */
+	if (hinfo->hash >= hash2)
+	{
+		swap(*bh, bh2);
+		de = de2;
+	}
+	dx_insert_block (frame, hash2 + continued, newblock);
+	err = ext4_journal_dirty_metadata (handle, bh2);
+	if (err)
+		goto journal_error;
+	err = ext4_journal_dirty_metadata (handle, frame->bh);
+	if (err)
+		goto journal_error;
+	brelse (bh2);
+	dxtrace(dx_show_index ("frame", frame->entries));
+errout:
+	return de;
+}
+#endif
+
+
+/*
+ * Add a new entry into a directory (leaf) block.  If de is non-NULL,
+ * it points to a directory entry which is guaranteed to be large
+ * enough for new directory entry.  If de is NULL, then
+ * add_dirent_to_buf will attempt search the directory block for
+ * space.  It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if directory entry already exists.
+ *
+ * NOTE!  bh is NOT released in the case where ENOSPC is returned.  In
+ * all other cases bh is released.
+ */
+static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode, struct ext4_dir_entry_2 *de,
+			     struct buffer_head * bh)
+{
+	struct inode	*dir = dentry->d_parent->d_inode;
+	const char	*name = dentry->d_name.name;
+	int		namelen = dentry->d_name.len;
+	unsigned long	offset = 0;
+	unsigned short	reclen;
+	int		nlen, rlen, err;
+	char		*top;
+
+	reclen = EXT4_DIR_REC_LEN(namelen);
+	if (!de) {
+		de = (struct ext4_dir_entry_2 *)bh->b_data;
+		top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+		while ((char *) de <= top) {
+			if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
+						  bh, offset)) {
+				brelse (bh);
+				return -EIO;
+			}
+			if (ext4_match (namelen, name, de)) {
+				brelse (bh);
+				return -EEXIST;
+			}
+			nlen = EXT4_DIR_REC_LEN(de->name_len);
+			rlen = le16_to_cpu(de->rec_len);
+			if ((de->inode? rlen - nlen: rlen) >= reclen)
+				break;
+			de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+			offset += rlen;
+		}
+		if ((char *) de > top)
+			return -ENOSPC;
+	}
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err) {
+		ext4_std_error(dir->i_sb, err);
+		brelse(bh);
+		return err;
+	}
+
+	/* By now the buffer is marked for journaling */
+	nlen = EXT4_DIR_REC_LEN(de->name_len);
+	rlen = le16_to_cpu(de->rec_len);
+	if (de->inode) {
+		struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
+		de1->rec_len = cpu_to_le16(rlen - nlen);
+		de->rec_len = cpu_to_le16(nlen);
+		de = de1;
+	}
+	de->file_type = EXT4_FT_UNKNOWN;
+	if (inode) {
+		de->inode = cpu_to_le32(inode->i_ino);
+		ext4_set_de_type(dir->i_sb, de, inode->i_mode);
+	} else
+		de->inode = 0;
+	de->name_len = namelen;
+	memcpy (de->name, name, namelen);
+	/*
+	 * XXX shouldn't update any times until successful
+	 * completion of syscall, but too many callers depend
+	 * on this.
+	 *
+	 * XXX similarly, too many callers depend on
+	 * ext4_new_inode() setting the times, but error
+	 * recovery deletes the inode, so the worst that can
+	 * happen is that the times are slightly out of date
+	 * and/or different from the directory change time.
+	 */
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	ext4_update_dx_flag(dir);
+	dir->i_version++;
+	ext4_mark_inode_dirty(handle, dir);
+	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+	err = ext4_journal_dirty_metadata(handle, bh);
+	if (err)
+		ext4_std_error(dir->i_sb, err);
+	brelse(bh);
+	return 0;
+}
+
+#ifdef CONFIG_EXT4_INDEX
+/*
+ * This converts a one block unindexed directory to a 3 block indexed
+ * directory, and adds the dentry to the indexed directory.
+ */
+static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
+			    struct inode *inode, struct buffer_head *bh)
+{
+	struct inode	*dir = dentry->d_parent->d_inode;
+	const char	*name = dentry->d_name.name;
+	int		namelen = dentry->d_name.len;
+	struct buffer_head *bh2;
+	struct dx_root	*root;
+	struct dx_frame	frames[2], *frame;
+	struct dx_entry *entries;
+	struct ext4_dir_entry_2	*de, *de2;
+	char		*data1, *top;
+	unsigned	len;
+	int		retval;
+	unsigned	blocksize;
+	struct dx_hash_info hinfo;
+	u32		block;
+	struct fake_dirent *fde;
+
+	blocksize =  dir->i_sb->s_blocksize;
+	dxtrace(printk("Creating index\n"));
+	retval = ext4_journal_get_write_access(handle, bh);
+	if (retval) {
+		ext4_std_error(dir->i_sb, retval);
+		brelse(bh);
+		return retval;
+	}
+	root = (struct dx_root *) bh->b_data;
+
+	bh2 = ext4_append (handle, dir, &block, &retval);
+	if (!(bh2)) {
+		brelse(bh);
+		return retval;
+	}
+	EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+	data1 = bh2->b_data;
+
+	/* The 0th block becomes the root, move the dirents out */
+	fde = &root->dotdot;
+	de = (struct ext4_dir_entry_2 *)((char *)fde + le16_to_cpu(fde->rec_len));
+	len = ((char *) root) + blocksize - (char *) de;
+	memcpy (data1, de, len);
+	de = (struct ext4_dir_entry_2 *) data1;
+	top = data1 + len;
+	while ((char *)(de2=(void*)de+le16_to_cpu(de->rec_len)) < top)
+		de = de2;
+	de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
+	/* Initialize the root; the dot dirents already exist */
+	de = (struct ext4_dir_entry_2 *) (&root->dotdot);
+	de->rec_len = cpu_to_le16(blocksize - EXT4_DIR_REC_LEN(2));
+	memset (&root->info, 0, sizeof(root->info));
+	root->info.info_length = sizeof(root->info);
+	root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+	entries = root->entries;
+	dx_set_block (entries, 1);
+	dx_set_count (entries, 1);
+	dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
+
+	/* Initialize as for dx_probe */
+	hinfo.hash_version = root->info.hash_version;
+	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
+	ext4fs_dirhash(name, namelen, &hinfo);
+	frame = frames;
+	frame->entries = entries;
+	frame->at = entries;
+	frame->bh = bh;
+	bh = bh2;
+	de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
+	dx_release (frames);
+	if (!(de))
+		return retval;
+
+	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+}
+#endif
+
+/*
+ *	ext4_add_entry()
+ *
+ * adds a file entry to the specified directory, using the same
+ * semantics as ext4_find_entry(). It returns NULL if it failed.
+ *
+ * NOTE!! The inode part of 'de' is left at 0 - which means you
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
+	struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	unsigned long offset;
+	struct buffer_head * bh;
+	struct ext4_dir_entry_2 *de;
+	struct super_block * sb;
+	int	retval;
+#ifdef CONFIG_EXT4_INDEX
+	int	dx_fallback=0;
+#endif
+	unsigned blocksize;
+	u32 block, blocks;
+
+	sb = dir->i_sb;
+	blocksize = sb->s_blocksize;
+	if (!dentry->d_name.len)
+		return -EINVAL;
+#ifdef CONFIG_EXT4_INDEX
+	if (is_dx(dir)) {
+		retval = ext4_dx_add_entry(handle, dentry, inode);
+		if (!retval || (retval != ERR_BAD_DX_DIR))
+			return retval;
+		EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+		dx_fallback++;
+		ext4_mark_inode_dirty(handle, dir);
+	}
+#endif
+	blocks = dir->i_size >> sb->s_blocksize_bits;
+	for (block = 0, offset = 0; block < blocks; block++) {
+		bh = ext4_bread(handle, dir, block, 0, &retval);
+		if(!bh)
+			return retval;
+		retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+		if (retval != -ENOSPC)
+			return retval;
+
+#ifdef CONFIG_EXT4_INDEX
+		if (blocks == 1 && !dx_fallback &&
+		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
+			return make_indexed_dir(handle, dentry, inode, bh);
+#endif
+		brelse(bh);
+	}
+	bh = ext4_append(handle, dir, &block, &retval);
+	if (!bh)
+		return retval;
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	de->inode = 0;
+	de->rec_len = cpu_to_le16(blocksize);
+	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+}
+
+#ifdef CONFIG_EXT4_INDEX
+/*
+ * Returns 0 for success, or a negative error value
+ */
+static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
+			     struct inode *inode)
+{
+	struct dx_frame frames[2], *frame;
+	struct dx_entry *entries, *at;
+	struct dx_hash_info hinfo;
+	struct buffer_head * bh;
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct super_block * sb = dir->i_sb;
+	struct ext4_dir_entry_2 *de;
+	int err;
+
+	frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+	if (!frame)
+		return err;
+	entries = frame->entries;
+	at = frame->at;
+
+	if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
+		goto cleanup;
+
+	BUFFER_TRACE(bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, bh);
+	if (err)
+		goto journal_error;
+
+	err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+	if (err != -ENOSPC) {
+		bh = NULL;
+		goto cleanup;
+	}
+
+	/* Block full, should compress but for now just split */
+	dxtrace(printk("using %u of %u node entries\n",
+		       dx_get_count(entries), dx_get_limit(entries)));
+	/* Need to split index? */
+	if (dx_get_count(entries) == dx_get_limit(entries)) {
+		u32 newblock;
+		unsigned icount = dx_get_count(entries);
+		int levels = frame - frames;
+		struct dx_entry *entries2;
+		struct dx_node *node2;
+		struct buffer_head *bh2;
+
+		if (levels && (dx_get_count(frames->entries) ==
+			       dx_get_limit(frames->entries))) {
+			ext4_warning(sb, __FUNCTION__,
+				     "Directory index full!");
+			err = -ENOSPC;
+			goto cleanup;
+		}
+		bh2 = ext4_append (handle, dir, &newblock, &err);
+		if (!(bh2))
+			goto cleanup;
+		node2 = (struct dx_node *)(bh2->b_data);
+		entries2 = node2->entries;
+		node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
+		node2->fake.inode = 0;
+		BUFFER_TRACE(frame->bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, frame->bh);
+		if (err)
+			goto journal_error;
+		if (levels) {
+			unsigned icount1 = icount/2, icount2 = icount - icount1;
+			unsigned hash2 = dx_get_hash(entries + icount1);
+			dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+
+			BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+			err = ext4_journal_get_write_access(handle,
+							     frames[0].bh);
+			if (err)
+				goto journal_error;
+
+			memcpy ((char *) entries2, (char *) (entries + icount1),
+				icount2 * sizeof(struct dx_entry));
+			dx_set_count (entries, icount1);
+			dx_set_count (entries2, icount2);
+			dx_set_limit (entries2, dx_node_limit(dir));
+
+			/* Which index block gets the new entry? */
+			if (at - entries >= icount1) {
+				frame->at = at = at - entries - icount1 + entries2;
+				frame->entries = entries = entries2;
+				swap(frame->bh, bh2);
+			}
+			dx_insert_block (frames + 0, hash2, newblock);
+			dxtrace(dx_show_index ("node", frames[1].entries));
+			dxtrace(dx_show_index ("node",
+			       ((struct dx_node *) bh2->b_data)->entries));
+			err = ext4_journal_dirty_metadata(handle, bh2);
+			if (err)
+				goto journal_error;
+			brelse (bh2);
+		} else {
+			dxtrace(printk("Creating second level index...\n"));
+			memcpy((char *) entries2, (char *) entries,
+			       icount * sizeof(struct dx_entry));
+			dx_set_limit(entries2, dx_node_limit(dir));
+
+			/* Set up root */
+			dx_set_count(entries, 1);
+			dx_set_block(entries + 0, newblock);
+			((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
+
+			/* Add new access path frame */
+			frame = frames + 1;
+			frame->at = at = at - entries + entries2;
+			frame->entries = entries = entries2;
+			frame->bh = bh2;
+			err = ext4_journal_get_write_access(handle,
+							     frame->bh);
+			if (err)
+				goto journal_error;
+		}
+		ext4_journal_dirty_metadata(handle, frames[0].bh);
+	}
+	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
+	if (!de)
+		goto cleanup;
+	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	bh = NULL;
+	goto cleanup;
+
+journal_error:
+	ext4_std_error(dir->i_sb, err);
+cleanup:
+	if (bh)
+		brelse(bh);
+	dx_release(frames);
+	return err;
+}
+#endif
+
+/*
+ * ext4_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+static int ext4_delete_entry (handle_t *handle,
+			      struct inode * dir,
+			      struct ext4_dir_entry_2 * de_del,
+			      struct buffer_head * bh)
+{
+	struct ext4_dir_entry_2 * de, * pde;
+	int i;
+
+	i = 0;
+	pde = NULL;
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	while (i < bh->b_size) {
+		if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i))
+			return -EIO;
+		if (de == de_del)  {
+			BUFFER_TRACE(bh, "get_write_access");
+			ext4_journal_get_write_access(handle, bh);
+			if (pde)
+				pde->rec_len =
+					cpu_to_le16(le16_to_cpu(pde->rec_len) +
+						    le16_to_cpu(de->rec_len));
+			else
+				de->inode = 0;
+			dir->i_version++;
+			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
+			ext4_journal_dirty_metadata(handle, bh);
+			return 0;
+		}
+		i += le16_to_cpu(de->rec_len);
+		pde = de;
+		de = (struct ext4_dir_entry_2 *)
+			((char *) de + le16_to_cpu(de->rec_len));
+	}
+	return -ENOENT;
+}
+
+/*
+ * ext4_mark_inode_dirty is somewhat expensive, so unlike ext2 we
+ * do not perform it in these functions.  We perform it at the call site,
+ * if it is needed.
+ */
+static inline void ext4_inc_count(handle_t *handle, struct inode *inode)
+{
+	inc_nlink(inode);
+}
+
+static inline void ext4_dec_count(handle_t *handle, struct inode *inode)
+{
+	drop_nlink(inode);
+}
+
+static int ext4_add_nondir(handle_t *handle,
+		struct dentry *dentry, struct inode *inode)
+{
+	int err = ext4_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext4_mark_inode_dirty(handle, inode);
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	ext4_dec_count(handle, inode);
+	iput(inode);
+	return err;
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
+		struct nameidata *nd)
+{
+	handle_t *handle;
+	struct inode * inode;
+	int err, retries = 0;
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = ext4_new_inode (handle, dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &ext4_file_inode_operations;
+		inode->i_fop = &ext4_file_operations;
+		ext4_set_aops(inode);
+		err = ext4_add_nondir(handle, dentry, inode);
+	}
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+static int ext4_mknod (struct inode * dir, struct dentry *dentry,
+			int mode, dev_t rdev)
+{
+	handle_t *handle;
+	struct inode *inode;
+	int err, retries = 0;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = ext4_new_inode (handle, dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+		inode->i_op = &ext4_special_inode_operations;
+#endif
+		err = ext4_add_nondir(handle, dentry, inode);
+	}
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+{
+	handle_t *handle;
+	struct inode * inode;
+	struct buffer_head * dir_block;
+	struct ext4_dir_entry_2 * de;
+	int err, retries = 0;
+
+	if (dir->i_nlink >= EXT4_LINK_MAX)
+		return -EMLINK;
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+					2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_stop;
+
+	inode->i_op = &ext4_dir_inode_operations;
+	inode->i_fop = &ext4_dir_operations;
+	inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+	dir_block = ext4_bread (handle, inode, 0, 1, &err);
+	if (!dir_block) {
+		drop_nlink(inode); /* is this nlink == 0? */
+		ext4_mark_inode_dirty(handle, inode);
+		iput (inode);
+		goto out_stop;
+	}
+	BUFFER_TRACE(dir_block, "get_write_access");
+	ext4_journal_get_write_access(handle, dir_block);
+	de = (struct ext4_dir_entry_2 *) dir_block->b_data;
+	de->inode = cpu_to_le32(inode->i_ino);
+	de->name_len = 1;
+	de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de->name_len));
+	strcpy (de->name, ".");
+	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
+	de = (struct ext4_dir_entry_2 *)
+			((char *) de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le32(dir->i_ino);
+	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT4_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy (de->name, "..");
+	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
+	inode->i_nlink = 2;
+	BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
+	ext4_journal_dirty_metadata(handle, dir_block);
+	brelse (dir_block);
+	ext4_mark_inode_dirty(handle, inode);
+	err = ext4_add_entry (handle, dentry, inode);
+	if (err) {
+		inode->i_nlink = 0;
+		ext4_mark_inode_dirty(handle, inode);
+		iput (inode);
+		goto out_stop;
+	}
+	inc_nlink(dir);
+	ext4_update_dx_flag(dir);
+	ext4_mark_inode_dirty(handle, dir);
+	d_instantiate(dentry, inode);
+out_stop:
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+static int empty_dir (struct inode * inode)
+{
+	unsigned long offset;
+	struct buffer_head * bh;
+	struct ext4_dir_entry_2 * de, * de1;
+	struct super_block * sb;
+	int err = 0;
+
+	sb = inode->i_sb;
+	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
+	    !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
+		if (err)
+			ext4_error(inode->i_sb, __FUNCTION__,
+				   "error %d reading directory #%lu offset 0",
+				   err, inode->i_ino);
+		else
+			ext4_warning(inode->i_sb, __FUNCTION__,
+				     "bad directory (dir #%lu) - no data block",
+				     inode->i_ino);
+		return 1;
+	}
+	de = (struct ext4_dir_entry_2 *) bh->b_data;
+	de1 = (struct ext4_dir_entry_2 *)
+			((char *) de + le16_to_cpu(de->rec_len));
+	if (le32_to_cpu(de->inode) != inode->i_ino ||
+			!le32_to_cpu(de1->inode) ||
+			strcmp (".", de->name) ||
+			strcmp ("..", de1->name)) {
+		ext4_warning (inode->i_sb, "empty_dir",
+			      "bad directory (dir #%lu) - no `.' or `..'",
+			      inode->i_ino);
+		brelse (bh);
+		return 1;
+	}
+	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+	de = (struct ext4_dir_entry_2 *)
+			((char *) de1 + le16_to_cpu(de1->rec_len));
+	while (offset < inode->i_size ) {
+		if (!bh ||
+			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+			err = 0;
+			brelse (bh);
+			bh = ext4_bread (NULL, inode,
+				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+			if (!bh) {
+				if (err)
+					ext4_error(sb, __FUNCTION__,
+						   "error %d reading directory"
+						   " #%lu offset %lu",
+						   err, inode->i_ino, offset);
+				offset += sb->s_blocksize;
+				continue;
+			}
+			de = (struct ext4_dir_entry_2 *) bh->b_data;
+		}
+		if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) {
+			de = (struct ext4_dir_entry_2 *)(bh->b_data +
+							 sb->s_blocksize);
+			offset = (offset | (sb->s_blocksize - 1)) + 1;
+			continue;
+		}
+		if (le32_to_cpu(de->inode)) {
+			brelse (bh);
+			return 0;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ext4_dir_entry_2 *)
+				((char *) de + le16_to_cpu(de->rec_len));
+	}
+	brelse (bh);
+	return 1;
+}
+
+/* ext4_orphan_add() links an unlinked or truncated inode into a list of
+ * such inodes, starting at the superblock, in case we crash before the
+ * file is closed/deleted, or in case the inode truncate spans multiple
+ * transactions and the last transaction is not recovered after a crash.
+ *
+ * At filesystem recovery time, we walk this list deleting unlinked
+ * inodes and truncating linked inodes in ext4_orphan_cleanup().
+ */
+int ext4_orphan_add(handle_t *handle, struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_iloc iloc;
+	int err = 0, rc;
+
+	lock_super(sb);
+	if (!list_empty(&EXT4_I(inode)->i_orphan))
+		goto out_unlock;
+
+	/* Orphan handling is only valid for files with data blocks
+	 * being truncated, or files being unlinked. */
+
+	/* @@@ FIXME: Observation from aviro:
+	 * I think I can trigger J_ASSERT in ext4_orphan_add().  We block
+	 * here (on lock_super()), so race with ext4_link() which might bump
+	 * ->i_nlink. For, say it, character device. Not a regular file,
+	 * not a directory, not a symlink and ->i_nlink > 0.
+	 */
+	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+		S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+
+	BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (err)
+		goto out_unlock;
+
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out_unlock;
+
+	/* Insert this inode at the head of the on-disk orphan list... */
+	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+	err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	if (!err)
+		err = rc;
+
+	/* Only add to the head of the in-memory list if all the
+	 * previous operations succeeded.  If the orphan_add is going to
+	 * fail (possibly taking the journal offline), we can't risk
+	 * leaving the inode on the orphan list: stray orphan-list
+	 * entries can cause panics at unmount time.
+	 *
+	 * This is safe: on error we're going to ignore the orphan list
+	 * anyway on the next recovery. */
+	if (!err)
+		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+
+	jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
+	jbd_debug(4, "orphan inode %lu will point to %d\n",
+			inode->i_ino, NEXT_ORPHAN(inode));
+out_unlock:
+	unlock_super(sb);
+	ext4_std_error(inode->i_sb, err);
+	return err;
+}
+
+/*
+ * ext4_orphan_del() removes an unlinked or truncated inode from the list
+ * of such inodes stored on disk, because it is finally being cleaned up.
+ */
+int ext4_orphan_del(handle_t *handle, struct inode *inode)
+{
+	struct list_head *prev;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi;
+	unsigned long ino_next;
+	struct ext4_iloc iloc;
+	int err = 0;
+
+	lock_super(inode->i_sb);
+	if (list_empty(&ei->i_orphan)) {
+		unlock_super(inode->i_sb);
+		return 0;
+	}
+
+	ino_next = NEXT_ORPHAN(inode);
+	prev = ei->i_orphan.prev;
+	sbi = EXT4_SB(inode->i_sb);
+
+	jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+
+	list_del_init(&ei->i_orphan);
+
+	/* If we're on an error path, we may not have a valid
+	 * transaction handle with which to update the orphan list on
+	 * disk, but we still need to remove the inode from the linked
+	 * list in memory. */
+	if (!handle)
+		goto out;
+
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out_err;
+
+	if (prev == &sbi->s_orphan) {
+		jbd_debug(4, "superblock will point to %lu\n", ino_next);
+		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+		if (err)
+			goto out_brelse;
+		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+		err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+	} else {
+		struct ext4_iloc iloc2;
+		struct inode *i_prev =
+			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
+
+		jbd_debug(4, "orphan inode %lu will point to %lu\n",
+			  i_prev->i_ino, ino_next);
+		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+		if (err)
+			goto out_brelse;
+		NEXT_ORPHAN(i_prev) = ino_next;
+		err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+	}
+	if (err)
+		goto out_brelse;
+	NEXT_ORPHAN(inode) = 0;
+	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+
+out_err:
+	ext4_std_error(inode->i_sb, err);
+out:
+	unlock_super(inode->i_sb);
+	return err;
+
+out_brelse:
+	brelse(iloc.bh);
+	goto out_err;
+}
+
+static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode * inode;
+	struct buffer_head * bh;
+	struct ext4_dir_entry_2 * de;
+	handle_t *handle;
+
+	/* Initialize quotas before so that eventual writes go in
+	 * separate transaction */
+	DQUOT_INIT(dentry->d_inode);
+	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	retval = -ENOENT;
+	bh = ext4_find_entry (dentry, &de);
+	if (!bh)
+		goto end_rmdir;
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = dentry->d_inode;
+
+	retval = -EIO;
+	if (le32_to_cpu(de->inode) != inode->i_ino)
+		goto end_rmdir;
+
+	retval = -ENOTEMPTY;
+	if (!empty_dir (inode))
+		goto end_rmdir;
+
+	retval = ext4_delete_entry(handle, dir, de, bh);
+	if (retval)
+		goto end_rmdir;
+	if (inode->i_nlink != 2)
+		ext4_warning (inode->i_sb, "ext4_rmdir",
+			      "empty directory has nlink!=2 (%d)",
+			      inode->i_nlink);
+	inode->i_version++;
+	clear_nlink(inode);
+	/* There's no need to set i_disksize: the fact that i_nlink is
+	 * zero will ensure that the right thing happens during any
+	 * recovery. */
+	inode->i_size = 0;
+	ext4_orphan_add(handle, inode);
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	ext4_mark_inode_dirty(handle, inode);
+	drop_nlink(dir);
+	ext4_update_dx_flag(dir);
+	ext4_mark_inode_dirty(handle, dir);
+
+end_rmdir:
+	ext4_journal_stop(handle);
+	brelse (bh);
+	return retval;
+}
+
+static int ext4_unlink(struct inode * dir, struct dentry *dentry)
+{
+	int retval;
+	struct inode * inode;
+	struct buffer_head * bh;
+	struct ext4_dir_entry_2 * de;
+	handle_t *handle;
+
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	DQUOT_INIT(dentry->d_inode);
+	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	retval = -ENOENT;
+	bh = ext4_find_entry (dentry, &de);
+	if (!bh)
+		goto end_unlink;
+
+	inode = dentry->d_inode;
+
+	retval = -EIO;
+	if (le32_to_cpu(de->inode) != inode->i_ino)
+		goto end_unlink;
+
+	if (!inode->i_nlink) {
+		ext4_warning (inode->i_sb, "ext4_unlink",
+			      "Deleting nonexistent file (%lu), %d",
+			      inode->i_ino, inode->i_nlink);
+		inode->i_nlink = 1;
+	}
+	retval = ext4_delete_entry(handle, dir, de, bh);
+	if (retval)
+		goto end_unlink;
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	ext4_update_dx_flag(dir);
+	ext4_mark_inode_dirty(handle, dir);
+	drop_nlink(inode);
+	if (!inode->i_nlink)
+		ext4_orphan_add(handle, inode);
+	inode->i_ctime = dir->i_ctime;
+	ext4_mark_inode_dirty(handle, inode);
+	retval = 0;
+
+end_unlink:
+	ext4_journal_stop(handle);
+	brelse (bh);
+	return retval;
+}
+
+static int ext4_symlink (struct inode * dir,
+		struct dentry *dentry, const char * symname)
+{
+	handle_t *handle;
+	struct inode * inode;
+	int l, err, retries = 0;
+
+	l = strlen(symname)+1;
+	if (l > dir->i_sb->s_blocksize)
+		return -ENAMETOOLONG;
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
+					2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_stop;
+
+	if (l > sizeof (EXT4_I(inode)->i_data)) {
+		inode->i_op = &ext4_symlink_inode_operations;
+		ext4_set_aops(inode);
+		/*
+		 * page_symlink() calls into ext4_prepare/commit_write.
+		 * We have a transaction open.  All is sweetness.  It also sets
+		 * i_size in generic_commit_write().
+		 */
+		err = __page_symlink(inode, symname, l,
+				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+		if (err) {
+			ext4_dec_count(handle, inode);
+			ext4_mark_inode_dirty(handle, inode);
+			iput (inode);
+			goto out_stop;
+		}
+	} else {
+		inode->i_op = &ext4_fast_symlink_inode_operations;
+		memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
+		inode->i_size = l-1;
+	}
+	EXT4_I(inode)->i_disksize = inode->i_size;
+	err = ext4_add_nondir(handle, dentry, inode);
+out_stop:
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+static int ext4_link (struct dentry * old_dentry,
+		struct inode * dir, struct dentry *dentry)
+{
+	handle_t *handle;
+	struct inode *inode = old_dentry->d_inode;
+	int err, retries = 0;
+
+	if (inode->i_nlink >= EXT4_LINK_MAX)
+		return -EMLINK;
+
+retry:
+	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(dir))
+		handle->h_sync = 1;
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	ext4_inc_count(handle, inode);
+	atomic_inc(&inode->i_count);
+
+	err = ext4_add_nondir(handle, dentry, inode);
+	ext4_journal_stop(handle);
+	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
+		goto retry;
+	return err;
+}
+
+#define PARENT_INO(buffer) \
+	((struct ext4_dir_entry_2 *) ((char *) buffer + \
+	le16_to_cpu(((struct ext4_dir_entry_2 *) buffer)->rec_len)))->inode
+
+/*
+ * Anybody can rename anything with this: the permission checks are left to the
+ * higher-level routines.
+ */
+static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
+			   struct inode * new_dir,struct dentry *new_dentry)
+{
+	handle_t *handle;
+	struct inode * old_inode, * new_inode;
+	struct buffer_head * old_bh, * new_bh, * dir_bh;
+	struct ext4_dir_entry_2 * old_de, * new_de;
+	int retval;
+
+	old_bh = new_bh = dir_bh = NULL;
+
+	/* Initialize quotas before so that eventual writes go
+	 * in separate transaction */
+	if (new_dentry->d_inode)
+		DQUOT_INIT(new_dentry->d_inode);
+	handle = ext4_journal_start(old_dir, 2 *
+					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+		handle->h_sync = 1;
+
+	old_bh = ext4_find_entry (old_dentry, &old_de);
+	/*
+	 *  Check for inode number is _not_ due to possible IO errors.
+	 *  We might rmdir the source, keep it as pwd of some process
+	 *  and merrily kill the link to whatever was created under the
+	 *  same name. Goodbye sticky bit ;-<
+	 */
+	old_inode = old_dentry->d_inode;
+	retval = -ENOENT;
+	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+		goto end_rename;
+
+	new_inode = new_dentry->d_inode;
+	new_bh = ext4_find_entry (new_dentry, &new_de);
+	if (new_bh) {
+		if (!new_inode) {
+			brelse (new_bh);
+			new_bh = NULL;
+		}
+	}
+	if (S_ISDIR(old_inode->i_mode)) {
+		if (new_inode) {
+			retval = -ENOTEMPTY;
+			if (!empty_dir (new_inode))
+				goto end_rename;
+		}
+		retval = -EIO;
+		dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
+		if (!dir_bh)
+			goto end_rename;
+		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+			goto end_rename;
+		retval = -EMLINK;
+		if (!new_inode && new_dir!=old_dir &&
+				new_dir->i_nlink >= EXT4_LINK_MAX)
+			goto end_rename;
+	}
+	if (!new_bh) {
+		retval = ext4_add_entry (handle, new_dentry, old_inode);
+		if (retval)
+			goto end_rename;
+	} else {
+		BUFFER_TRACE(new_bh, "get write access");
+		ext4_journal_get_write_access(handle, new_bh);
+		new_de->inode = cpu_to_le32(old_inode->i_ino);
+		if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
+					      EXT4_FEATURE_INCOMPAT_FILETYPE))
+			new_de->file_type = old_de->file_type;
+		new_dir->i_version++;
+		BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
+		ext4_journal_dirty_metadata(handle, new_bh);
+		brelse(new_bh);
+		new_bh = NULL;
+	}
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 */
+	old_inode->i_ctime = CURRENT_TIME_SEC;
+	ext4_mark_inode_dirty(handle, old_inode);
+
+	/*
+	 * ok, that's it
+	 */
+	if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
+	    old_de->name_len != old_dentry->d_name.len ||
+	    strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
+	    (retval = ext4_delete_entry(handle, old_dir,
+					old_de, old_bh)) == -ENOENT) {
+		/* old_de could have moved from under us during htree split, so
+		 * make sure that we are deleting the right entry.  We might
+		 * also be pointing to a stale entry in the unused part of
+		 * old_bh so just checking inum and the name isn't enough. */
+		struct buffer_head *old_bh2;
+		struct ext4_dir_entry_2 *old_de2;
+
+		old_bh2 = ext4_find_entry(old_dentry, &old_de2);
+		if (old_bh2) {
+			retval = ext4_delete_entry(handle, old_dir,
+						   old_de2, old_bh2);
+			brelse(old_bh2);
+		}
+	}
+	if (retval) {
+		ext4_warning(old_dir->i_sb, "ext4_rename",
+				"Deleting old file (%lu), %d, error=%d",
+				old_dir->i_ino, old_dir->i_nlink, retval);
+	}
+
+	if (new_inode) {
+		drop_nlink(new_inode);
+		new_inode->i_ctime = CURRENT_TIME_SEC;
+	}
+	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+	ext4_update_dx_flag(old_dir);
+	if (dir_bh) {
+		BUFFER_TRACE(dir_bh, "get_write_access");
+		ext4_journal_get_write_access(handle, dir_bh);
+		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+		BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
+		ext4_journal_dirty_metadata(handle, dir_bh);
+		drop_nlink(old_dir);
+		if (new_inode) {
+			drop_nlink(new_inode);
+		} else {
+			inc_nlink(new_dir);
+			ext4_update_dx_flag(new_dir);
+			ext4_mark_inode_dirty(handle, new_dir);
+		}
+	}
+	ext4_mark_inode_dirty(handle, old_dir);
+	if (new_inode) {
+		ext4_mark_inode_dirty(handle, new_inode);
+		if (!new_inode->i_nlink)
+			ext4_orphan_add(handle, new_inode);
+	}
+	retval = 0;
+
+end_rename:
+	brelse (dir_bh);
+	brelse (old_bh);
+	brelse (new_bh);
+	ext4_journal_stop(handle);
+	return retval;
+}
+
+/*
+ * directories can handle most operations...
+ */
+struct inode_operations ext4_dir_inode_operations = {
+	.create		= ext4_create,
+	.lookup		= ext4_lookup,
+	.link		= ext4_link,
+	.unlink		= ext4_unlink,
+	.symlink	= ext4_symlink,
+	.mkdir		= ext4_mkdir,
+	.rmdir		= ext4_rmdir,
+	.mknod		= ext4_mknod,
+	.rename		= ext4_rename,
+	.setattr	= ext4_setattr,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.permission	= ext4_permission,
+};
+
+struct inode_operations ext4_special_inode_operations = {
+	.setattr	= ext4_setattr,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+	.permission	= ext4_permission,
+};
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
new file mode 100644
index 000000000000..5e4dfff36a00
--- /dev/null
+++ b/fs/ext4/namei.h
@@ -0,0 +1,8 @@
+/*  linux/fs/ext4/namei.h
+ *
+ * Copyright (C) 2005 Simtec Electronics
+ *	Ben Dooks <ben@simtec.co.uk>
+ *
+*/
+
+extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
new file mode 100644
index 000000000000..1e9578052cd3
--- /dev/null
+++ b/fs/ext4/resize.c
@@ -0,0 +1,1045 @@
+/*
+ *  linux/fs/ext4/resize.c
+ *
+ * Support for resizing an ext4 filesystem while it is mounted.
+ *
+ * Copyright (C) 2001, 2002 Andreas Dilger <adilger@clusterfs.com>
+ *
+ * This could probably be made into a module, because it is not often in use.
+ */
+
+
+#define EXT4FS_DEBUG
+
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/ext4_jbd2.h>
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+
+#define outside(b, first, last)	((b) < (first) || (b) >= (last))
+#define inside(b, first, last)	((b) >= (first) && (b) < (last))
+
+static int verify_group_input(struct super_block *sb,
+			      struct ext4_new_group_data *input)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t start = ext4_blocks_count(es);
+	ext4_fsblk_t end = start + input->blocks_count;
+	unsigned group = input->group;
+	ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
+	unsigned overhead = ext4_bg_has_super(sb, group) ?
+		(1 + ext4_bg_num_gdb(sb, group) +
+		 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
+	ext4_fsblk_t metaend = start + overhead;
+	struct buffer_head *bh = NULL;
+	ext4_grpblk_t free_blocks_count, offset;
+	int err = -EINVAL;
+
+	input->free_blocks_count = free_blocks_count =
+		input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
+		       "(%d free, %u reserved)\n",
+		       ext4_bg_has_super(sb, input->group) ? "normal" :
+		       "no-super", input->group, input->blocks_count,
+		       free_blocks_count, input->reserved_blocks);
+
+	ext4_get_group_no_and_offset(sb, start, NULL, &offset);
+	if (group != sbi->s_groups_count)
+		ext4_warning(sb, __FUNCTION__,
+			     "Cannot add at group %u (only %lu groups)",
+			     input->group, sbi->s_groups_count);
+	else if (offset != 0)
+			ext4_warning(sb, __FUNCTION__, "Last group not full");
+	else if (input->reserved_blocks > input->blocks_count / 5)
+		ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)",
+			     input->reserved_blocks);
+	else if (free_blocks_count < 0)
+		ext4_warning(sb, __FUNCTION__, "Bad blocks count %u",
+			     input->blocks_count);
+	else if (!(bh = sb_bread(sb, end - 1)))
+		ext4_warning(sb, __FUNCTION__,
+			     "Cannot read last block (%llu)",
+			     end - 1);
+	else if (outside(input->block_bitmap, start, end))
+		ext4_warning(sb, __FUNCTION__,
+			     "Block bitmap not in group (block %llu)",
+			     input->block_bitmap);
+	else if (outside(input->inode_bitmap, start, end))
+		ext4_warning(sb, __FUNCTION__,
+			     "Inode bitmap not in group (block %llu)",
+			     input->inode_bitmap);
+	else if (outside(input->inode_table, start, end) ||
+	         outside(itend - 1, start, end))
+		ext4_warning(sb, __FUNCTION__,
+			     "Inode table not in group (blocks %llu-%llu)",
+			     input->inode_table, itend - 1);
+	else if (input->inode_bitmap == input->block_bitmap)
+		ext4_warning(sb, __FUNCTION__,
+			     "Block bitmap same as inode bitmap (%llu)",
+			     input->block_bitmap);
+	else if (inside(input->block_bitmap, input->inode_table, itend))
+		ext4_warning(sb, __FUNCTION__,
+			     "Block bitmap (%llu) in inode table (%llu-%llu)",
+			     input->block_bitmap, input->inode_table, itend-1);
+	else if (inside(input->inode_bitmap, input->inode_table, itend))
+		ext4_warning(sb, __FUNCTION__,
+			     "Inode bitmap (%llu) in inode table (%llu-%llu)",
+			     input->inode_bitmap, input->inode_table, itend-1);
+	else if (inside(input->block_bitmap, start, metaend))
+		ext4_warning(sb, __FUNCTION__,
+			     "Block bitmap (%llu) in GDT table"
+			     " (%llu-%llu)",
+			     input->block_bitmap, start, metaend - 1);
+	else if (inside(input->inode_bitmap, start, metaend))
+		ext4_warning(sb, __FUNCTION__,
+			     "Inode bitmap (%llu) in GDT table"
+			     " (%llu-%llu)",
+			     input->inode_bitmap, start, metaend - 1);
+	else if (inside(input->inode_table, start, metaend) ||
+	         inside(itend - 1, start, metaend))
+		ext4_warning(sb, __FUNCTION__,
+			     "Inode table (%llu-%llu) overlaps"
+			     "GDT table (%llu-%llu)",
+			     input->inode_table, itend - 1, start, metaend - 1);
+	else
+		err = 0;
+	brelse(bh);
+
+	return err;
+}
+
+static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
+				  ext4_fsblk_t blk)
+{
+	struct buffer_head *bh;
+	int err;
+
+	bh = sb_getblk(sb, blk);
+	if (!bh)
+		return ERR_PTR(-EIO);
+	if ((err = ext4_journal_get_write_access(handle, bh))) {
+		brelse(bh);
+		bh = ERR_PTR(err);
+	} else {
+		lock_buffer(bh);
+		memset(bh->b_data, 0, sb->s_blocksize);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+	}
+
+	return bh;
+}
+
+/*
+ * To avoid calling the atomic setbit hundreds or thousands of times, we only
+ * need to use it within a single byte (to ensure we get endianness right).
+ * We can use memset for the rest of the bitmap as there are no other users.
+ */
+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
+{
+	int i;
+
+	if (start_bit >= end_bit)
+		return;
+
+	ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
+	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
+		ext4_set_bit(i, bitmap);
+	if (i < end_bit)
+		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
+}
+
+/*
+ * Set up the block and inode bitmaps, and the inode table for the new group.
+ * This doesn't need to be part of the main transaction, since we are only
+ * changing blocks outside the actual filesystem.  We still do journaling to
+ * ensure the recovery is correct in case of a failure just after resize.
+ * If any part of this fails, we simply abort the resize.
+ */
+static int setup_new_group_blocks(struct super_block *sb,
+				  struct ext4_new_group_data *input)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group);
+	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
+		le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0;
+	unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
+	struct buffer_head *bh;
+	handle_t *handle;
+	ext4_fsblk_t block;
+	ext4_grpblk_t bit;
+	int i;
+	int err = 0, err2;
+
+	handle = ext4_journal_start_sb(sb, reserved_gdb + gdblocks +
+				       2 + sbi->s_itb_per_group);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	lock_super(sb);
+	if (input->group != sbi->s_groups_count) {
+		err = -EBUSY;
+		goto exit_journal;
+	}
+
+	if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
+		err = PTR_ERR(bh);
+		goto exit_journal;
+	}
+
+	if (ext4_bg_has_super(sb, input->group)) {
+		ext4_debug("mark backup superblock %#04lx (+0)\n", start);
+		ext4_set_bit(0, bh->b_data);
+	}
+
+	/* Copy all of the GDT blocks into the backup in this group */
+	for (i = 0, bit = 1, block = start + 1;
+	     i < gdblocks; i++, block++, bit++) {
+		struct buffer_head *gdb;
+
+		ext4_debug("update backup group %#04lx (+%d)\n", block, bit);
+
+		gdb = sb_getblk(sb, block);
+		if (!gdb) {
+			err = -EIO;
+			goto exit_bh;
+		}
+		if ((err = ext4_journal_get_write_access(handle, gdb))) {
+			brelse(gdb);
+			goto exit_bh;
+		}
+		lock_buffer(bh);
+		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size);
+		set_buffer_uptodate(gdb);
+		unlock_buffer(bh);
+		ext4_journal_dirty_metadata(handle, gdb);
+		ext4_set_bit(bit, bh->b_data);
+		brelse(gdb);
+	}
+
+	/* Zero out all of the reserved backup group descriptor table blocks */
+	for (i = 0, bit = gdblocks + 1, block = start + bit;
+	     i < reserved_gdb; i++, block++, bit++) {
+		struct buffer_head *gdb;
+
+		ext4_debug("clear reserved block %#04lx (+%d)\n", block, bit);
+
+		if (IS_ERR(gdb = bclean(handle, sb, block))) {
+			err = PTR_ERR(bh);
+			goto exit_bh;
+		}
+		ext4_journal_dirty_metadata(handle, gdb);
+		ext4_set_bit(bit, bh->b_data);
+		brelse(gdb);
+	}
+	ext4_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap,
+		   input->block_bitmap - start);
+	ext4_set_bit(input->block_bitmap - start, bh->b_data);
+	ext4_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap,
+		   input->inode_bitmap - start);
+	ext4_set_bit(input->inode_bitmap - start, bh->b_data);
+
+	/* Zero out all of the inode table blocks */
+	for (i = 0, block = input->inode_table, bit = block - start;
+	     i < sbi->s_itb_per_group; i++, bit++, block++) {
+		struct buffer_head *it;
+
+		ext4_debug("clear inode block %#04lx (+%d)\n", block, bit);
+		if (IS_ERR(it = bclean(handle, sb, block))) {
+			err = PTR_ERR(it);
+			goto exit_bh;
+		}
+		ext4_journal_dirty_metadata(handle, it);
+		brelse(it);
+		ext4_set_bit(bit, bh->b_data);
+	}
+	mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
+			bh->b_data);
+	ext4_journal_dirty_metadata(handle, bh);
+	brelse(bh);
+
+	/* Mark unused entries in inode bitmap used */
+	ext4_debug("clear inode bitmap %#04x (+%ld)\n",
+		   input->inode_bitmap, input->inode_bitmap - start);
+	if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) {
+		err = PTR_ERR(bh);
+		goto exit_journal;
+	}
+
+	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+			bh->b_data);
+	ext4_journal_dirty_metadata(handle, bh);
+exit_bh:
+	brelse(bh);
+
+exit_journal:
+	unlock_super(sb);
+	if ((err2 = ext4_journal_stop(handle)) && !err)
+		err = err2;
+
+	return err;
+}
+
+
+/*
+ * Iterate through the groups which hold BACKUP superblock/GDT copies in an
+ * ext4 filesystem.  The counters should be initialized to 1, 5, and 7 before
+ * calling this for the first time.  In a sparse filesystem it will be the
+ * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ...
+ * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ...
+ */
+static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
+				  unsigned *five, unsigned *seven)
+{
+	unsigned *min = three;
+	int mult = 3;
+	unsigned ret;
+
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+		ret = *min;
+		*min += 1;
+		return ret;
+	}
+
+	if (*five < *min) {
+		min = five;
+		mult = 5;
+	}
+	if (*seven < *min) {
+		min = seven;
+		mult = 7;
+	}
+
+	ret = *min;
+	*min *= mult;
+
+	return ret;
+}
+
+/*
+ * Check that all of the backup GDT blocks are held in the primary GDT block.
+ * It is assumed that they are stored in group order.  Returns the number of
+ * groups in current filesystem that have BACKUPS, or -ve error code.
+ */
+static int verify_reserved_gdb(struct super_block *sb,
+			       struct buffer_head *primary)
+{
+	const ext4_fsblk_t blk = primary->b_blocknr;
+	const unsigned long end = EXT4_SB(sb)->s_groups_count;
+	unsigned three = 1;
+	unsigned five = 5;
+	unsigned seven = 7;
+	unsigned grp;
+	__le32 *p = (__le32 *)primary->b_data;
+	int gdbackups = 0;
+
+	while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) {
+		if (le32_to_cpu(*p++) !=
+		    grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){
+			ext4_warning(sb, __FUNCTION__,
+				     "reserved GDT %llu"
+				     " missing grp %d (%llu)",
+				     blk, grp,
+				     grp *
+				     (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) +
+				     blk);
+			return -EINVAL;
+		}
+		if (++gdbackups > EXT4_ADDR_PER_BLOCK(sb))
+			return -EFBIG;
+	}
+
+	return gdbackups;
+}
+
+/*
+ * Called when we need to bring a reserved group descriptor table block into
+ * use from the resize inode.  The primary copy of the new GDT block currently
+ * is an indirect block (under the double indirect block in the resize inode).
+ * The new backup GDT blocks will be stored as leaf blocks in this indirect
+ * block, in group order.  Even though we know all the block numbers we need,
+ * we check to ensure that the resize inode has actually reserved these blocks.
+ *
+ * Don't need to update the block bitmaps because the blocks are still in use.
+ *
+ * We get all of the error cases out of the way, so that we are sure to not
+ * fail once we start modifying the data on disk, because JBD has no rollback.
+ */
+static int add_new_gdb(handle_t *handle, struct inode *inode,
+		       struct ext4_new_group_data *input,
+		       struct buffer_head **primary)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
+	ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
+	struct buffer_head **o_group_desc, **n_group_desc;
+	struct buffer_head *dind;
+	int gdbackups;
+	struct ext4_iloc iloc;
+	__le32 *data;
+	int err;
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG
+		       "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
+		       gdb_num);
+
+	/*
+	 * If we are not using the primary superblock/GDT copy don't resize,
+	 * because the user tools have no way of handling this.  Probably a
+	 * bad time to do it anyways.
+	 */
+	if (EXT4_SB(sb)->s_sbh->b_blocknr !=
+	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+		ext4_warning(sb, __FUNCTION__,
+			"won't resize using backup superblock at %llu",
+			(unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+		return -EPERM;
+	}
+
+	*primary = sb_bread(sb, gdblock);
+	if (!*primary)
+		return -EIO;
+
+	if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) {
+		err = gdbackups;
+		goto exit_bh;
+	}
+
+	data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
+	dind = sb_bread(sb, le32_to_cpu(*data));
+	if (!dind) {
+		err = -EIO;
+		goto exit_bh;
+	}
+
+	data = (__le32 *)dind->b_data;
+	if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
+		ext4_warning(sb, __FUNCTION__,
+			     "new group %u GDT block %llu not reserved",
+			     input->group, gdblock);
+		err = -EINVAL;
+		goto exit_dind;
+	}
+
+	if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh)))
+		goto exit_dind;
+
+	if ((err = ext4_journal_get_write_access(handle, *primary)))
+		goto exit_sbh;
+
+	if ((err = ext4_journal_get_write_access(handle, dind)))
+		goto exit_primary;
+
+	/* ext4_reserve_inode_write() gets a reference on the iloc */
+	if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
+		goto exit_dindj;
+
+	n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *),
+			GFP_KERNEL);
+	if (!n_group_desc) {
+		err = -ENOMEM;
+		ext4_warning (sb, __FUNCTION__,
+			      "not enough memory for %lu groups", gdb_num + 1);
+		goto exit_inode;
+	}
+
+	/*
+	 * Finally, we have all of the possible failures behind us...
+	 *
+	 * Remove new GDT block from inode double-indirect block and clear out
+	 * the new GDT block for use (which also "frees" the backup GDT blocks
+	 * from the reserved inode).  We don't need to change the bitmaps for
+	 * these blocks, because they are marked as in-use from being in the
+	 * reserved inode, and will become GDT blocks (primary and backup).
+	 */
+	data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
+	ext4_journal_dirty_metadata(handle, dind);
+	brelse(dind);
+	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
+	ext4_mark_iloc_dirty(handle, inode, &iloc);
+	memset((*primary)->b_data, 0, sb->s_blocksize);
+	ext4_journal_dirty_metadata(handle, *primary);
+
+	o_group_desc = EXT4_SB(sb)->s_group_desc;
+	memcpy(n_group_desc, o_group_desc,
+	       EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+	n_group_desc[gdb_num] = *primary;
+	EXT4_SB(sb)->s_group_desc = n_group_desc;
+	EXT4_SB(sb)->s_gdb_count++;
+	kfree(o_group_desc);
+
+	es->s_reserved_gdt_blocks =
+		cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1);
+	ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+
+	return 0;
+
+exit_inode:
+	//ext4_journal_release_buffer(handle, iloc.bh);
+	brelse(iloc.bh);
+exit_dindj:
+	//ext4_journal_release_buffer(handle, dind);
+exit_primary:
+	//ext4_journal_release_buffer(handle, *primary);
+exit_sbh:
+	//ext4_journal_release_buffer(handle, *primary);
+exit_dind:
+	brelse(dind);
+exit_bh:
+	brelse(*primary);
+
+	ext4_debug("leaving with error %d\n", err);
+	return err;
+}
+
+/*
+ * Called when we are adding a new group which has a backup copy of each of
+ * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks.
+ * We need to add these reserved backup GDT blocks to the resize inode, so
+ * that they are kept for future resizing and not allocated to files.
+ *
+ * Each reserved backup GDT block will go into a different indirect block.
+ * The indirect blocks are actually the primary reserved GDT blocks,
+ * so we know in advance what their block numbers are.  We only get the
+ * double-indirect block to verify it is pointing to the primary reserved
+ * GDT blocks so we don't overwrite a data block by accident.  The reserved
+ * backup GDT blocks are stored in their reserved primary GDT block.
+ */
+static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
+			      struct ext4_new_group_data *input)
+{
+	struct super_block *sb = inode->i_sb;
+	int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
+	struct buffer_head **primary;
+	struct buffer_head *dind;
+	struct ext4_iloc iloc;
+	ext4_fsblk_t blk;
+	__le32 *data, *end;
+	int gdbackups = 0;
+	int res, i;
+	int err;
+
+	primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL);
+	if (!primary)
+		return -ENOMEM;
+
+	data = EXT4_I(inode)->i_data + EXT4_DIND_BLOCK;
+	dind = sb_bread(sb, le32_to_cpu(*data));
+	if (!dind) {
+		err = -EIO;
+		goto exit_free;
+	}
+
+	blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
+	data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count;
+	end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
+
+	/* Get each reserved primary GDT block and verify it holds backups */
+	for (res = 0; res < reserved_gdb; res++, blk++) {
+		if (le32_to_cpu(*data) != blk) {
+			ext4_warning(sb, __FUNCTION__,
+				     "reserved block %llu"
+				     " not at offset %ld",
+				     blk,
+				     (long)(data - (__le32 *)dind->b_data));
+			err = -EINVAL;
+			goto exit_bh;
+		}
+		primary[res] = sb_bread(sb, blk);
+		if (!primary[res]) {
+			err = -EIO;
+			goto exit_bh;
+		}
+		if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) {
+			brelse(primary[res]);
+			err = gdbackups;
+			goto exit_bh;
+		}
+		if (++data >= end)
+			data = (__le32 *)dind->b_data;
+	}
+
+	for (i = 0; i < reserved_gdb; i++) {
+		if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
+			/*
+			int j;
+			for (j = 0; j < i; j++)
+				ext4_journal_release_buffer(handle, primary[j]);
+			 */
+			goto exit_bh;
+		}
+	}
+
+	if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
+		goto exit_bh;
+
+	/*
+	 * Finally we can add each of the reserved backup GDT blocks from
+	 * the new group to its reserved primary GDT block.
+	 */
+	blk = input->group * EXT4_BLOCKS_PER_GROUP(sb);
+	for (i = 0; i < reserved_gdb; i++) {
+		int err2;
+		data = (__le32 *)primary[i]->b_data;
+		/* printk("reserving backup %lu[%u] = %lu\n",
+		       primary[i]->b_blocknr, gdbackups,
+		       blk + primary[i]->b_blocknr); */
+		data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
+		err2 = ext4_journal_dirty_metadata(handle, primary[i]);
+		if (!err)
+			err = err2;
+	}
+	inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9;
+	ext4_mark_iloc_dirty(handle, inode, &iloc);
+
+exit_bh:
+	while (--res >= 0)
+		brelse(primary[res]);
+	brelse(dind);
+
+exit_free:
+	kfree(primary);
+
+	return err;
+}
+
+/*
+ * Update the backup copies of the ext4 metadata.  These don't need to be part
+ * of the main resize transaction, because e2fsck will re-write them if there
+ * is a problem (basically only OOM will cause a problem).  However, we
+ * _should_ update the backups if possible, in case the primary gets trashed
+ * for some reason and we need to run e2fsck from a backup superblock.  The
+ * important part is that the new block and inode counts are in the backup
+ * superblocks, and the location of the new group metadata in the GDT backups.
+ *
+ * We do not need lock_super() for this, because these blocks are not
+ * otherwise touched by the filesystem code when it is mounted.  We don't
+ * need to worry about last changing from sbi->s_groups_count, because the
+ * worst that can happen is that we do not copy the full number of backups
+ * at this time.  The resize which changed s_groups_count will backup again.
+ */
+static void update_backups(struct super_block *sb,
+			   int blk_off, char *data, int size)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	const unsigned long last = sbi->s_groups_count;
+	const int bpg = EXT4_BLOCKS_PER_GROUP(sb);
+	unsigned three = 1;
+	unsigned five = 5;
+	unsigned seven = 7;
+	unsigned group;
+	int rest = sb->s_blocksize - size;
+	handle_t *handle;
+	int err = 0, err2;
+
+	handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
+	if (IS_ERR(handle)) {
+		group = 1;
+		err = PTR_ERR(handle);
+		goto exit_err;
+	}
+
+	while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
+		struct buffer_head *bh;
+
+		/* Out of journal space, and can't get more - abort - so sad */
+		if (handle->h_buffer_credits == 0 &&
+		    ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
+		    (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
+			break;
+
+		bh = sb_getblk(sb, group * bpg + blk_off);
+		if (!bh) {
+			err = -EIO;
+			break;
+		}
+		ext4_debug("update metadata backup %#04lx\n",
+			  (unsigned long)bh->b_blocknr);
+		if ((err = ext4_journal_get_write_access(handle, bh)))
+			break;
+		lock_buffer(bh);
+		memcpy(bh->b_data, data, size);
+		if (rest)
+			memset(bh->b_data + size, 0, rest);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		ext4_journal_dirty_metadata(handle, bh);
+		brelse(bh);
+	}
+	if ((err2 = ext4_journal_stop(handle)) && !err)
+		err = err2;
+
+	/*
+	 * Ugh! Need to have e2fsck write the backup copies.  It is too
+	 * late to revert the resize, we shouldn't fail just because of
+	 * the backup copies (they are only needed in case of corruption).
+	 *
+	 * However, if we got here we have a journal problem too, so we
+	 * can't really start a transaction to mark the superblock.
+	 * Chicken out and just set the flag on the hope it will be written
+	 * to disk, and if not - we will simply wait until next fsck.
+	 */
+exit_err:
+	if (err) {
+		ext4_warning(sb, __FUNCTION__,
+			     "can't update backup for group %d (err %d), "
+			     "forcing fsck on next reboot", group, err);
+		sbi->s_mount_state &= ~EXT4_VALID_FS;
+		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
+		mark_buffer_dirty(sbi->s_sbh);
+	}
+}
+
+/* Add group descriptor data to an existing or new group descriptor block.
+ * Ensure we handle all possible error conditions _before_ we start modifying
+ * the filesystem, because we cannot abort the transaction and not have it
+ * write the data to disk.
+ *
+ * If we are on a GDT block boundary, we need to get the reserved GDT block.
+ * Otherwise, we may need to add backup GDT blocks for a sparse group.
+ *
+ * We only need to hold the superblock lock while we are actually adding
+ * in the new group's counts to the superblock.  Prior to that we have
+ * not really "added" the group at all.  We re-check that we are still
+ * adding in the last group in case things have changed since verifying.
+ */
+int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
+		le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
+	struct buffer_head *primary = NULL;
+	struct ext4_group_desc *gdp;
+	struct inode *inode = NULL;
+	handle_t *handle;
+	int gdb_off, gdb_num;
+	int err, err2;
+
+	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
+	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
+
+	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+		ext4_warning(sb, __FUNCTION__,
+			     "Can't resize non-sparse filesystem further");
+		return -EPERM;
+	}
+
+	if (ext4_blocks_count(es) + input->blocks_count <
+	    ext4_blocks_count(es)) {
+		ext4_warning(sb, __FUNCTION__, "blocks_count overflow\n");
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
+	    le32_to_cpu(es->s_inodes_count)) {
+		ext4_warning(sb, __FUNCTION__, "inodes_count overflow\n");
+		return -EINVAL;
+	}
+
+	if (reserved_gdb || gdb_off == 0) {
+		if (!EXT4_HAS_COMPAT_FEATURE(sb,
+					     EXT4_FEATURE_COMPAT_RESIZE_INODE)){
+			ext4_warning(sb, __FUNCTION__,
+				     "No reserved GDT blocks, can't resize");
+			return -EPERM;
+		}
+		inode = iget(sb, EXT4_RESIZE_INO);
+		if (!inode || is_bad_inode(inode)) {
+			ext4_warning(sb, __FUNCTION__,
+				     "Error opening resize inode");
+			iput(inode);
+			return -ENOENT;
+		}
+	}
+
+	if ((err = verify_group_input(sb, input)))
+		goto exit_put;
+
+	if ((err = setup_new_group_blocks(sb, input)))
+		goto exit_put;
+
+	/*
+	 * We will always be modifying at least the superblock and a GDT
+	 * block.  If we are adding a group past the last current GDT block,
+	 * we will also modify the inode and the dindirect block.  If we
+	 * are adding a group with superblock/GDT backups  we will also
+	 * modify each of the reserved GDT dindirect blocks.
+	 */
+	handle = ext4_journal_start_sb(sb,
+				       ext4_bg_has_super(sb, input->group) ?
+				       3 + reserved_gdb : 4);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit_put;
+	}
+
+	lock_super(sb);
+	if (input->group != sbi->s_groups_count) {
+		ext4_warning(sb, __FUNCTION__,
+			     "multiple resizers run on filesystem!");
+		err = -EBUSY;
+		goto exit_journal;
+	}
+
+	if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
+		goto exit_journal;
+
+	/*
+	 * We will only either add reserved group blocks to a backup group
+	 * or remove reserved blocks for the first group in a new group block.
+	 * Doing both would be mean more complex code, and sane people don't
+	 * use non-sparse filesystems anymore.  This is already checked above.
+	 */
+	if (gdb_off) {
+		primary = sbi->s_group_desc[gdb_num];
+		if ((err = ext4_journal_get_write_access(handle, primary)))
+			goto exit_journal;
+
+		if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) &&
+		    (err = reserve_backup_gdb(handle, inode, input)))
+			goto exit_journal;
+	} else if ((err = add_new_gdb(handle, inode, input, &primary)))
+		goto exit_journal;
+
+	/*
+	 * OK, now we've set up the new group.  Time to make it active.
+	 *
+	 * Current kernels don't lock all allocations via lock_super(),
+	 * so we have to be safe wrt. concurrent accesses the group
+	 * data.  So we need to be careful to set all of the relevant
+	 * group descriptor data etc. *before* we enable the group.
+	 *
+	 * The key field here is sbi->s_groups_count: as long as
+	 * that retains its old value, nobody is going to access the new
+	 * group.
+	 *
+	 * So first we update all the descriptor metadata for the new
+	 * group; then we update the total disk blocks count; then we
+	 * update the groups count to enable the group; then finally we
+	 * update the free space counts so that the system can start
+	 * using the new disk blocks.
+	 */
+
+	/* Update group descriptor block for new group */
+	gdp = (struct ext4_group_desc *)primary->b_data + gdb_off;
+
+	ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
+	ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
+	ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
+	gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
+	gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+
+	/*
+	 * Make the new blocks and inodes valid next.  We do this before
+	 * increasing the group count so that once the group is enabled,
+	 * all of its blocks and inodes are already valid.
+	 *
+	 * We always allocate group-by-group, then block-by-block or
+	 * inode-by-inode within a group, so enabling these
+	 * blocks/inodes before the group is live won't actually let us
+	 * allocate the new space yet.
+	 */
+	ext4_blocks_count_set(es, ext4_blocks_count(es) +
+		input->blocks_count);
+	es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) +
+		EXT4_INODES_PER_GROUP(sb));
+
+	/*
+	 * We need to protect s_groups_count against other CPUs seeing
+	 * inconsistent state in the superblock.
+	 *
+	 * The precise rules we use are:
+	 *
+	 * * Writers of s_groups_count *must* hold lock_super
+	 * AND
+	 * * Writers must perform a smp_wmb() after updating all dependent
+	 *   data and before modifying the groups count
+	 *
+	 * * Readers must hold lock_super() over the access
+	 * OR
+	 * * Readers must perform an smp_rmb() after reading the groups count
+	 *   and before reading any dependent data.
+	 *
+	 * NB. These rules can be relaxed when checking the group count
+	 * while freeing data, as we can only allocate from a block
+	 * group after serialising against the group count, and we can
+	 * only then free after serialising in turn against that
+	 * allocation.
+	 */
+	smp_wmb();
+
+	/* Update the global fs size fields */
+	sbi->s_groups_count++;
+
+	ext4_journal_dirty_metadata(handle, primary);
+
+	/* Update the reserved block counts only once the new group is
+	 * active. */
+	ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
+		input->reserved_blocks);
+
+	/* Update the free space counts */
+	percpu_counter_mod(&sbi->s_freeblocks_counter,
+			   input->free_blocks_count);
+	percpu_counter_mod(&sbi->s_freeinodes_counter,
+			   EXT4_INODES_PER_GROUP(sb));
+
+	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+	sb->s_dirt = 1;
+
+exit_journal:
+	unlock_super(sb);
+	if ((err2 = ext4_journal_stop(handle)) && !err)
+		err = err2;
+	if (!err) {
+		update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
+			       sizeof(struct ext4_super_block));
+		update_backups(sb, primary->b_blocknr, primary->b_data,
+			       primary->b_size);
+	}
+exit_put:
+	iput(inode);
+	return err;
+} /* ext4_group_add */
+
+/* Extend the filesystem to the new number of blocks specified.  This entry
+ * point is only used to extend the current filesystem to the end of the last
+ * existing group.  It can be accessed via ioctl, or by "remount,resize=<size>"
+ * for emergencies (because it has no dependencies on reserved blocks).
+ *
+ * If we _really_ wanted, we could use default values to call ext4_group_add()
+ * allow the "remount" trick to work for arbitrary resizing, assuming enough
+ * GDT blocks are reserved to grow to the desired size.
+ */
+int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
+		      ext4_fsblk_t n_blocks_count)
+{
+	ext4_fsblk_t o_blocks_count;
+	unsigned long o_groups_count;
+	ext4_grpblk_t last;
+	ext4_grpblk_t add;
+	struct buffer_head * bh;
+	handle_t *handle;
+	int err;
+	unsigned long freed_blocks;
+
+	/* We don't need to worry about locking wrt other resizers just
+	 * yet: we're going to revalidate es->s_blocks_count after
+	 * taking lock_super() below. */
+	o_blocks_count = ext4_blocks_count(es);
+	o_groups_count = EXT4_SB(sb)->s_groups_count;
+
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n",
+		       o_blocks_count, n_blocks_count);
+
+	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
+		return 0;
+
+	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		printk(KERN_ERR "EXT4-fs: filesystem on %s:"
+			" too large to resize to %llu blocks safely\n",
+			sb->s_id, n_blocks_count);
+		if (sizeof(sector_t) < 8)
+			ext4_warning(sb, __FUNCTION__,
+			"CONFIG_LBD not enabled\n");
+		return -EINVAL;
+	}
+
+	if (n_blocks_count < o_blocks_count) {
+		ext4_warning(sb, __FUNCTION__,
+			     "can't shrink FS - resize aborted");
+		return -EBUSY;
+	}
+
+	/* Handle the remaining blocks in the last group only. */
+	ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+
+	if (last == 0) {
+		ext4_warning(sb, __FUNCTION__,
+			     "need to use ext2online to resize further");
+		return -EPERM;
+	}
+
+	add = EXT4_BLOCKS_PER_GROUP(sb) - last;
+
+	if (o_blocks_count + add < o_blocks_count) {
+		ext4_warning(sb, __FUNCTION__, "blocks_count overflow");
+		return -EINVAL;
+	}
+
+	if (o_blocks_count + add > n_blocks_count)
+		add = n_blocks_count - o_blocks_count;
+
+	if (o_blocks_count + add < n_blocks_count)
+		ext4_warning(sb, __FUNCTION__,
+			     "will only finish group (%llu"
+			     " blocks, %u new)",
+			     o_blocks_count + add, add);
+
+	/* See if the device is actually as big as what was requested */
+	bh = sb_bread(sb, o_blocks_count + add -1);
+	if (!bh) {
+		ext4_warning(sb, __FUNCTION__,
+			     "can't read last block, resize aborted");
+		return -ENOSPC;
+	}
+	brelse(bh);
+
+	/* We will update the superblock, one block bitmap, and
+	 * one group descriptor via ext4_free_blocks().
+	 */
+	handle = ext4_journal_start_sb(sb, 3);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		ext4_warning(sb, __FUNCTION__, "error %d on journal start",err);
+		goto exit_put;
+	}
+
+	lock_super(sb);
+	if (o_blocks_count != ext4_blocks_count(es)) {
+		ext4_warning(sb, __FUNCTION__,
+			     "multiple resizers run on filesystem!");
+		unlock_super(sb);
+		err = -EBUSY;
+		goto exit_put;
+	}
+
+	if ((err = ext4_journal_get_write_access(handle,
+						 EXT4_SB(sb)->s_sbh))) {
+		ext4_warning(sb, __FUNCTION__,
+			     "error %d on journal write access", err);
+		unlock_super(sb);
+		ext4_journal_stop(handle);
+		goto exit_put;
+	}
+	ext4_blocks_count_set(es, o_blocks_count + add);
+	ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+	sb->s_dirt = 1;
+	unlock_super(sb);
+	ext4_debug("freeing blocks %lu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+	ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
+	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
+		   o_blocks_count + add);
+	if ((err = ext4_journal_stop(handle)))
+		goto exit_put;
+	if (test_opt(sb, DEBUG))
+		printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
+		       ext4_blocks_count(es));
+	update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
+		       sizeof(struct ext4_super_block));
+exit_put:
+	return err;
+} /* ext4_group_extend */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
new file mode 100644
index 000000000000..b4b022aa2bc2
--- /dev/null
+++ b/fs/ext4/super.c
@@ -0,0 +1,2829 @@
+/*
+ *  linux/fs/ext4/super.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/smp_lock.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+#include "xattr.h"
+#include "acl.h"
+#include "namei.h"
+
+static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
+			     unsigned long journal_devnum);
+static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
+			       unsigned int);
+static void ext4_commit_super (struct super_block * sb,
+			       struct ext4_super_block * es,
+			       int sync);
+static void ext4_mark_recovery_complete(struct super_block * sb,
+					struct ext4_super_block * es);
+static void ext4_clear_journal_err(struct super_block * sb,
+				   struct ext4_super_block * es);
+static int ext4_sync_fs(struct super_block *sb, int wait);
+static const char *ext4_decode_error(struct super_block * sb, int errno,
+				     char nbuf[16]);
+static int ext4_remount (struct super_block * sb, int * flags, char * data);
+static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
+static void ext4_unlockfs(struct super_block *sb);
+static void ext4_write_super (struct super_block * sb);
+static void ext4_write_super_lockfs(struct super_block *sb);
+
+
+ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
+			       struct ext4_group_desc *bg)
+{
+	return le32_to_cpu(bg->bg_block_bitmap) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+}
+
+ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
+			       struct ext4_group_desc *bg)
+{
+	return le32_to_cpu(bg->bg_inode_bitmap) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+}
+
+ext4_fsblk_t ext4_inode_table(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le32_to_cpu(bg->bg_inode_table) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+}
+
+void ext4_block_bitmap_set(struct super_block *sb,
+			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+	bg->bg_block_bitmap = cpu_to_le32((u32)blk);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
+}
+
+void ext4_inode_bitmap_set(struct super_block *sb,
+			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+	bg->bg_inode_bitmap  = cpu_to_le32((u32)blk);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
+}
+
+void ext4_inode_table_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, ext4_fsblk_t blk)
+{
+	bg->bg_inode_table = cpu_to_le32((u32)blk);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
+}
+
+/*
+ * Wrappers for jbd2_journal_start/end.
+ *
+ * The only special thing we need to do here is to make sure that all
+ * journal_end calls result in the superblock being marked dirty, so
+ * that sync() will call the filesystem's write_super callback if
+ * appropriate.
+ */
+handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
+{
+	journal_t *journal;
+
+	if (sb->s_flags & MS_RDONLY)
+		return ERR_PTR(-EROFS);
+
+	/* Special case here: if the journal has aborted behind our
+	 * backs (eg. EIO in the commit thread), then we still need to
+	 * take the FS itself readonly cleanly. */
+	journal = EXT4_SB(sb)->s_journal;
+	if (is_journal_aborted(journal)) {
+		ext4_abort(sb, __FUNCTION__,
+			   "Detected aborted journal");
+		return ERR_PTR(-EROFS);
+	}
+
+	return jbd2_journal_start(journal, nblocks);
+}
+
+/*
+ * The only special thing we need to do here is to make sure that all
+ * jbd2_journal_stop calls result in the superblock being marked dirty, so
+ * that sync() will call the filesystem's write_super callback if
+ * appropriate.
+ */
+int __ext4_journal_stop(const char *where, handle_t *handle)
+{
+	struct super_block *sb;
+	int err;
+	int rc;
+
+	sb = handle->h_transaction->t_journal->j_private;
+	err = handle->h_err;
+	rc = jbd2_journal_stop(handle);
+
+	if (!err)
+		err = rc;
+	if (err)
+		__ext4_std_error(sb, where, err);
+	return err;
+}
+
+void ext4_journal_abort_handle(const char *caller, const char *err_fn,
+		struct buffer_head *bh, handle_t *handle, int err)
+{
+	char nbuf[16];
+	const char *errstr = ext4_decode_error(NULL, err, nbuf);
+
+	if (bh)
+		BUFFER_TRACE(bh, "abort");
+
+	if (!handle->h_err)
+		handle->h_err = err;
+
+	if (is_handle_aborted(handle))
+		return;
+
+	printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
+	       caller, errstr, err_fn);
+
+	jbd2_journal_abort_handle(handle);
+}
+
+/* Deal with the reporting of failure conditions on a filesystem such as
+ * inconsistencies detected or read IO failures.
+ *
+ * On ext2, we can store the error state of the filesystem in the
+ * superblock.  That is not possible on ext4, because we may have other
+ * write ordering constraints on the superblock which prevent us from
+ * writing it out straight away; and given that the journal is about to
+ * be aborted, we can't rely on the current, or future, transactions to
+ * write out the superblock safely.
+ *
+ * We'll just use the jbd2_journal_abort() error code to record an error in
+ * the journal instead.  On recovery, the journal will compain about
+ * that error until we've noted it down and cleared it.
+ */
+
+static void ext4_handle_error(struct super_block *sb)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+	es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	if (!test_opt (sb, ERRORS_CONT)) {
+		journal_t *journal = EXT4_SB(sb)->s_journal;
+
+		EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+		if (journal)
+			jbd2_journal_abort(journal, -EIO);
+	}
+	if (test_opt (sb, ERRORS_RO)) {
+		printk (KERN_CRIT "Remounting filesystem read-only\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+	ext4_commit_super(sb, es, 1);
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("EXT4-fs (device %s): panic forced after error\n",
+			sb->s_id);
+}
+
+void ext4_error (struct super_block * sb, const char * function,
+		 const char * fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	ext4_handle_error(sb);
+}
+
+static const char *ext4_decode_error(struct super_block * sb, int errno,
+				     char nbuf[16])
+{
+	char *errstr = NULL;
+
+	switch (errno) {
+	case -EIO:
+		errstr = "IO failure";
+		break;
+	case -ENOMEM:
+		errstr = "Out of memory";
+		break;
+	case -EROFS:
+		if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+			errstr = "Journal has aborted";
+		else
+			errstr = "Readonly filesystem";
+		break;
+	default:
+		/* If the caller passed in an extra buffer for unknown
+		 * errors, textualise them now.  Else we just return
+		 * NULL. */
+		if (nbuf) {
+			/* Check for truncated error codes... */
+			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+				errstr = nbuf;
+		}
+		break;
+	}
+
+	return errstr;
+}
+
+/* __ext4_std_error decodes expected errors from journaling functions
+ * automatically and invokes the appropriate error response.  */
+
+void __ext4_std_error (struct super_block * sb, const char * function,
+		       int errno)
+{
+	char nbuf[16];
+	const char *errstr;
+
+	/* Special case: if the error is EROFS, and we're not already
+	 * inside a transaction, then there's really no point in logging
+	 * an error. */
+	if (errno == -EROFS && journal_current_handle() == NULL &&
+	    (sb->s_flags & MS_RDONLY))
+		return;
+
+	errstr = ext4_decode_error(sb, errno, nbuf);
+	printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
+		sb->s_id, function, errstr);
+
+	ext4_handle_error(sb);
+}
+
+/*
+ * ext4_abort is a much stronger failure handler than ext4_error.  The
+ * abort function may be used to deal with unrecoverable failures such
+ * as journal IO errors or ENOMEM at a critical moment in log management.
+ *
+ * We unconditionally force the filesystem into an ABORT|READONLY state,
+ * unless the error response on the fs has been set to panic in which
+ * case we take the easy way out and panic immediately.
+ */
+
+void ext4_abort (struct super_block * sb, const char * function,
+		 const char * fmt, ...)
+{
+	va_list args;
+
+	printk (KERN_CRIT "ext4_abort called.\n");
+
+	va_start(args, fmt);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_PANIC))
+		panic("EXT4-fs panic from previous error\n");
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	printk(KERN_CRIT "Remounting filesystem read-only\n");
+	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+	sb->s_flags |= MS_RDONLY;
+	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
+	jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+}
+
+void ext4_warning (struct super_block * sb, const char * function,
+		   const char * fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ",
+	       sb->s_id, function);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+
+void ext4_update_dynamic_rev(struct super_block *sb)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
+		return;
+
+	ext4_warning(sb, __FUNCTION__,
+		     "updating to rev %d because of new feature flag, "
+		     "running e2fsck is recommended",
+		     EXT4_DYNAMIC_REV);
+
+	es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
+	es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
+	es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
+	/* leave es->s_feature_*compat flags alone */
+	/* es->s_uuid will be set by e2fsck if empty */
+
+	/*
+	 * The rest of the superblock fields should be zero, and if not it
+	 * means they are likely already in use, so leave them alone.  We
+	 * can leave it up to e2fsck to clean up any inconsistencies there.
+	 */
+}
+
+/*
+ * Open the external journal device
+ */
+static struct block_device *ext4_blkdev_get(dev_t dev)
+{
+	struct block_device *bdev;
+	char b[BDEVNAME_SIZE];
+
+	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	if (IS_ERR(bdev))
+		goto fail;
+	return bdev;
+
+fail:
+	printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
+			__bdevname(dev, b), PTR_ERR(bdev));
+	return NULL;
+}
+
+/*
+ * Release the journal device
+ */
+static int ext4_blkdev_put(struct block_device *bdev)
+{
+	bd_release(bdev);
+	return blkdev_put(bdev);
+}
+
+static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
+{
+	struct block_device *bdev;
+	int ret = -ENODEV;
+
+	bdev = sbi->journal_bdev;
+	if (bdev) {
+		ret = ext4_blkdev_put(bdev);
+		sbi->journal_bdev = NULL;
+	}
+	return ret;
+}
+
+static inline struct inode *orphan_list_entry(struct list_head *l)
+{
+	return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
+}
+
+static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
+{
+	struct list_head *l;
+
+	printk(KERN_ERR "sb orphan head is %d\n",
+	       le32_to_cpu(sbi->s_es->s_last_orphan));
+
+	printk(KERN_ERR "sb_info orphan list:\n");
+	list_for_each(l, &sbi->s_orphan) {
+		struct inode *inode = orphan_list_entry(l);
+		printk(KERN_ERR "  "
+		       "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
+		       inode->i_sb->s_id, inode->i_ino, inode,
+		       inode->i_mode, inode->i_nlink,
+		       NEXT_ORPHAN(inode));
+	}
+}
+
+static void ext4_put_super (struct super_block * sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int i;
+
+	ext4_ext_release(sb);
+	ext4_xattr_put_super(sb);
+	jbd2_journal_destroy(sbi->s_journal);
+	if (!(sb->s_flags & MS_RDONLY)) {
+		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		es->s_state = cpu_to_le16(sbi->s_mount_state);
+		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
+		mark_buffer_dirty(sbi->s_sbh);
+		ext4_commit_super(sb, es, 1);
+	}
+
+	for (i = 0; i < sbi->s_gdb_count; i++)
+		brelse(sbi->s_group_desc[i]);
+	kfree(sbi->s_group_desc);
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	brelse(sbi->s_sbh);
+#ifdef CONFIG_QUOTA
+	for (i = 0; i < MAXQUOTAS; i++)
+		kfree(sbi->s_qf_names[i]);
+#endif
+
+	/* Debugging code just in case the in-memory inode orphan list
+	 * isn't empty.  The on-disk one can be non-empty if we've
+	 * detected an error and taken the fs readonly, but the
+	 * in-memory list had better be clean by this point. */
+	if (!list_empty(&sbi->s_orphan))
+		dump_orphan_list(sb, sbi);
+	J_ASSERT(list_empty(&sbi->s_orphan));
+
+	invalidate_bdev(sb->s_bdev, 0);
+	if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
+		/*
+		 * Invalidate the journal device's buffers.  We don't want them
+		 * floating about in memory - the physical journal device may
+		 * hotswapped, and it breaks the `ro-after' testing code.
+		 */
+		sync_blockdev(sbi->journal_bdev);
+		invalidate_bdev(sbi->journal_bdev, 0);
+		ext4_blkdev_remove(sbi);
+	}
+	sb->s_fs_info = NULL;
+	kfree(sbi);
+	return;
+}
+
+static kmem_cache_t *ext4_inode_cachep;
+
+/*
+ * Called inside transaction, so use GFP_NOFS
+ */
+static struct inode *ext4_alloc_inode(struct super_block *sb)
+{
+	struct ext4_inode_info *ei;
+
+	ei = kmem_cache_alloc(ext4_inode_cachep, SLAB_NOFS);
+	if (!ei)
+		return NULL;
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+	ei->i_acl = EXT4_ACL_NOT_CACHED;
+	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
+#endif
+	ei->i_block_alloc_info = NULL;
+	ei->vfs_inode.i_version = 1;
+	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
+	return &ei->vfs_inode;
+}
+
+static void ext4_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+	struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		INIT_LIST_HEAD(&ei->i_orphan);
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+		init_rwsem(&ei->xattr_sem);
+#endif
+		mutex_init(&ei->truncate_mutex);
+		inode_init_once(&ei->vfs_inode);
+	}
+}
+
+static int init_inodecache(void)
+{
+	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
+					     sizeof(struct ext4_inode_info),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once, NULL);
+	if (ext4_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	kmem_cache_destroy(ext4_inode_cachep);
+}
+
+static void ext4_clear_inode(struct inode *inode)
+{
+	struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+	if (EXT4_I(inode)->i_acl &&
+			EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
+		posix_acl_release(EXT4_I(inode)->i_acl);
+		EXT4_I(inode)->i_acl = EXT4_ACL_NOT_CACHED;
+	}
+	if (EXT4_I(inode)->i_default_acl &&
+			EXT4_I(inode)->i_default_acl != EXT4_ACL_NOT_CACHED) {
+		posix_acl_release(EXT4_I(inode)->i_default_acl);
+		EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
+	}
+#endif
+	ext4_discard_reservation(inode);
+	EXT4_I(inode)->i_block_alloc_info = NULL;
+	if (unlikely(rsv))
+		kfree(rsv);
+}
+
+static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sbi->s_jquota_fmt)
+		seq_printf(seq, ",jqfmt=%s",
+		(sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+
+	if (sbi->s_qf_names[USRQUOTA])
+		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+	if (sbi->s_qf_names[GRPQUOTA])
+		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+
+	if (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA)
+		seq_puts(seq, ",usrquota");
+
+	if (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)
+		seq_puts(seq, ",grpquota");
+#endif
+}
+
+static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct super_block *sb = vfs->mnt_sb;
+
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+		seq_puts(seq, ",data=journal");
+	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+		seq_puts(seq, ",data=ordered");
+	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+		seq_puts(seq, ",data=writeback");
+
+	ext4_show_quota_options(seq, sb);
+
+	return 0;
+}
+
+
+static struct dentry *ext4_get_dentry(struct super_block *sb, void *vobjp)
+{
+	__u32 *objp = vobjp;
+	unsigned long ino = objp[0];
+	__u32 generation = objp[1];
+	struct inode *inode;
+	struct dentry *result;
+
+	if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
+		return ERR_PTR(-ESTALE);
+	if (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+		return ERR_PTR(-ESTALE);
+
+	/* iget isn't really right if the inode is currently unallocated!!
+	 *
+	 * ext4_read_inode will return a bad_inode if the inode had been
+	 * deleted, so we should be safe.
+	 *
+	 * Currently we don't know the generation for parent directory, so
+	 * a generation of 0 means "accept any"
+	 */
+	inode = iget(sb, ino);
+	if (inode == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (is_bad_inode(inode) ||
+	    (generation && inode->i_generation != generation)) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	/* now to find a dentry.
+	 * If possible, get a well-connected one
+	 */
+	result = d_alloc_anon(inode);
+	if (!result) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	return result;
+}
+
+#ifdef CONFIG_QUOTA
+#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
+#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+
+static int ext4_dquot_initialize(struct inode *inode, int type);
+static int ext4_dquot_drop(struct inode *inode);
+static int ext4_write_dquot(struct dquot *dquot);
+static int ext4_acquire_dquot(struct dquot *dquot);
+static int ext4_release_dquot(struct dquot *dquot);
+static int ext4_mark_dquot_dirty(struct dquot *dquot);
+static int ext4_write_info(struct super_block *sb, int type);
+static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path);
+static int ext4_quota_on_mount(struct super_block *sb, int type);
+static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
+			       size_t len, loff_t off);
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off);
+
+static struct dquot_operations ext4_quota_operations = {
+	.initialize	= ext4_dquot_initialize,
+	.drop		= ext4_dquot_drop,
+	.alloc_space	= dquot_alloc_space,
+	.alloc_inode	= dquot_alloc_inode,
+	.free_space	= dquot_free_space,
+	.free_inode	= dquot_free_inode,
+	.transfer	= dquot_transfer,
+	.write_dquot	= ext4_write_dquot,
+	.acquire_dquot	= ext4_acquire_dquot,
+	.release_dquot	= ext4_release_dquot,
+	.mark_dirty	= ext4_mark_dquot_dirty,
+	.write_info	= ext4_write_info
+};
+
+static struct quotactl_ops ext4_qctl_operations = {
+	.quota_on	= ext4_quota_on,
+	.quota_off	= vfs_quota_off,
+	.quota_sync	= vfs_quota_sync,
+	.get_info	= vfs_get_dqinfo,
+	.set_info	= vfs_set_dqinfo,
+	.get_dqblk	= vfs_get_dqblk,
+	.set_dqblk	= vfs_set_dqblk
+};
+#endif
+
+static struct super_operations ext4_sops = {
+	.alloc_inode	= ext4_alloc_inode,
+	.destroy_inode	= ext4_destroy_inode,
+	.read_inode	= ext4_read_inode,
+	.write_inode	= ext4_write_inode,
+	.dirty_inode	= ext4_dirty_inode,
+	.delete_inode	= ext4_delete_inode,
+	.put_super	= ext4_put_super,
+	.write_super	= ext4_write_super,
+	.sync_fs	= ext4_sync_fs,
+	.write_super_lockfs = ext4_write_super_lockfs,
+	.unlockfs	= ext4_unlockfs,
+	.statfs		= ext4_statfs,
+	.remount_fs	= ext4_remount,
+	.clear_inode	= ext4_clear_inode,
+	.show_options	= ext4_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read	= ext4_quota_read,
+	.quota_write	= ext4_quota_write,
+#endif
+};
+
+static struct export_operations ext4_export_ops = {
+	.get_parent = ext4_get_parent,
+	.get_dentry = ext4_get_dentry,
+};
+
+enum {
+	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
+	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
+	Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
+	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+	Opt_grpquota, Opt_extents,
+};
+
+static match_table_t tokens = {
+	{Opt_bsd_df, "bsddf"},
+	{Opt_minix_df, "minixdf"},
+	{Opt_grpid, "grpid"},
+	{Opt_grpid, "bsdgroups"},
+	{Opt_nogrpid, "nogrpid"},
+	{Opt_nogrpid, "sysvgroups"},
+	{Opt_resgid, "resgid=%u"},
+	{Opt_resuid, "resuid=%u"},
+	{Opt_sb, "sb=%u"},
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_nouid32, "nouid32"},
+	{Opt_nocheck, "nocheck"},
+	{Opt_nocheck, "check=none"},
+	{Opt_debug, "debug"},
+	{Opt_oldalloc, "oldalloc"},
+	{Opt_orlov, "orlov"},
+	{Opt_user_xattr, "user_xattr"},
+	{Opt_nouser_xattr, "nouser_xattr"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_reservation, "reservation"},
+	{Opt_noreservation, "noreservation"},
+	{Opt_noload, "noload"},
+	{Opt_nobh, "nobh"},
+	{Opt_bh, "bh"},
+	{Opt_commit, "commit=%u"},
+	{Opt_journal_update, "journal=update"},
+	{Opt_journal_inum, "journal=%u"},
+	{Opt_journal_dev, "journal_dev=%u"},
+	{Opt_abort, "abort"},
+	{Opt_data_journal, "data=journal"},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_offusrjquota, "usrjquota="},
+	{Opt_usrjquota, "usrjquota=%s"},
+	{Opt_offgrpjquota, "grpjquota="},
+	{Opt_grpjquota, "grpjquota=%s"},
+	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
+	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+	{Opt_grpquota, "grpquota"},
+	{Opt_noquota, "noquota"},
+	{Opt_quota, "quota"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_barrier, "barrier=%u"},
+	{Opt_extents, "extents"},
+	{Opt_err, NULL},
+	{Opt_resize, "resize"},
+};
+
+static ext4_fsblk_t get_sb_block(void **data)
+{
+	ext4_fsblk_t	sb_block;
+	char		*options = (char *) *data;
+
+	if (!options || strncmp(options, "sb=", 3) != 0)
+		return 1;	/* Default location */
+	options += 3;
+	/*todo: use simple_strtoll with >32bit ext4 */
+	sb_block = simple_strtoul(options, &options, 0);
+	if (*options && *options != ',') {
+		printk("EXT4-fs: Invalid sb specification: %s\n",
+		       (char *) *data);
+		return 1;
+	}
+	if (*options == ',')
+		options++;
+	*data = (void *) options;
+	return sb_block;
+}
+
+static int parse_options (char *options, struct super_block *sb,
+			  unsigned int *inum, unsigned long *journal_devnum,
+			  ext4_fsblk_t *n_blocks_count, int is_remount)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	char * p;
+	substring_t args[MAX_OPT_ARGS];
+	int data_opt = 0;
+	int option;
+#ifdef CONFIG_QUOTA
+	int qtype;
+	char *qname;
+#endif
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep (&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_bsd_df:
+			clear_opt (sbi->s_mount_opt, MINIX_DF);
+			break;
+		case Opt_minix_df:
+			set_opt (sbi->s_mount_opt, MINIX_DF);
+			break;
+		case Opt_grpid:
+			set_opt (sbi->s_mount_opt, GRPID);
+			break;
+		case Opt_nogrpid:
+			clear_opt (sbi->s_mount_opt, GRPID);
+			break;
+		case Opt_resuid:
+			if (match_int(&args[0], &option))
+				return 0;
+			sbi->s_resuid = option;
+			break;
+		case Opt_resgid:
+			if (match_int(&args[0], &option))
+				return 0;
+			sbi->s_resgid = option;
+			break;
+		case Opt_sb:
+			/* handled by get_sb_block() instead of here */
+			/* *sb_block = match_int(&args[0]); */
+			break;
+		case Opt_err_panic:
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			set_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			break;
+		case Opt_err_ro:
+			clear_opt (sbi->s_mount_opt, ERRORS_CONT);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_RO);
+			break;
+		case Opt_err_cont:
+			clear_opt (sbi->s_mount_opt, ERRORS_RO);
+			clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
+			set_opt (sbi->s_mount_opt, ERRORS_CONT);
+			break;
+		case Opt_nouid32:
+			set_opt (sbi->s_mount_opt, NO_UID32);
+			break;
+		case Opt_nocheck:
+			clear_opt (sbi->s_mount_opt, CHECK);
+			break;
+		case Opt_debug:
+			set_opt (sbi->s_mount_opt, DEBUG);
+			break;
+		case Opt_oldalloc:
+			set_opt (sbi->s_mount_opt, OLDALLOC);
+			break;
+		case Opt_orlov:
+			clear_opt (sbi->s_mount_opt, OLDALLOC);
+			break;
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+		case Opt_user_xattr:
+			set_opt (sbi->s_mount_opt, XATTR_USER);
+			break;
+		case Opt_nouser_xattr:
+			clear_opt (sbi->s_mount_opt, XATTR_USER);
+			break;
+#else
+		case Opt_user_xattr:
+		case Opt_nouser_xattr:
+			printk("EXT4 (no)user_xattr options not supported\n");
+			break;
+#endif
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+		case Opt_acl:
+			set_opt(sbi->s_mount_opt, POSIX_ACL);
+			break;
+		case Opt_noacl:
+			clear_opt(sbi->s_mount_opt, POSIX_ACL);
+			break;
+#else
+		case Opt_acl:
+		case Opt_noacl:
+			printk("EXT4 (no)acl options not supported\n");
+			break;
+#endif
+		case Opt_reservation:
+			set_opt(sbi->s_mount_opt, RESERVATION);
+			break;
+		case Opt_noreservation:
+			clear_opt(sbi->s_mount_opt, RESERVATION);
+			break;
+		case Opt_journal_update:
+			/* @@@ FIXME */
+			/* Eventually we will want to be able to create
+			   a journal file here.  For now, only allow the
+			   user to specify an existing inode to be the
+			   journal file. */
+			if (is_remount) {
+				printk(KERN_ERR "EXT4-fs: cannot specify "
+				       "journal on remount\n");
+				return 0;
+			}
+			set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
+			break;
+		case Opt_journal_inum:
+			if (is_remount) {
+				printk(KERN_ERR "EXT4-fs: cannot specify "
+				       "journal on remount\n");
+				return 0;
+			}
+			if (match_int(&args[0], &option))
+				return 0;
+			*inum = option;
+			break;
+		case Opt_journal_dev:
+			if (is_remount) {
+				printk(KERN_ERR "EXT4-fs: cannot specify "
+				       "journal on remount\n");
+				return 0;
+			}
+			if (match_int(&args[0], &option))
+				return 0;
+			*journal_devnum = option;
+			break;
+		case Opt_noload:
+			set_opt (sbi->s_mount_opt, NOLOAD);
+			break;
+		case Opt_commit:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = JBD_DEFAULT_MAX_COMMIT_AGE;
+			sbi->s_commit_interval = HZ * option;
+			break;
+		case Opt_data_journal:
+			data_opt = EXT4_MOUNT_JOURNAL_DATA;
+			goto datacheck;
+		case Opt_data_ordered:
+			data_opt = EXT4_MOUNT_ORDERED_DATA;
+			goto datacheck;
+		case Opt_data_writeback:
+			data_opt = EXT4_MOUNT_WRITEBACK_DATA;
+		datacheck:
+			if (is_remount) {
+				if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
+						!= data_opt) {
+					printk(KERN_ERR
+						"EXT4-fs: cannot change data "
+						"mode on remount\n");
+					return 0;
+				}
+			} else {
+				sbi->s_mount_opt &= ~EXT4_MOUNT_DATA_FLAGS;
+				sbi->s_mount_opt |= data_opt;
+			}
+			break;
+#ifdef CONFIG_QUOTA
+		case Opt_usrjquota:
+			qtype = USRQUOTA;
+			goto set_qf_name;
+		case Opt_grpjquota:
+			qtype = GRPQUOTA;
+set_qf_name:
+			if (sb_any_quota_enabled(sb)) {
+				printk(KERN_ERR
+					"EXT4-fs: Cannot change journalled "
+					"quota options when quota turned on.\n");
+				return 0;
+			}
+			qname = match_strdup(&args[0]);
+			if (!qname) {
+				printk(KERN_ERR
+					"EXT4-fs: not enough memory for "
+					"storing quotafile name.\n");
+				return 0;
+			}
+			if (sbi->s_qf_names[qtype] &&
+			    strcmp(sbi->s_qf_names[qtype], qname)) {
+				printk(KERN_ERR
+					"EXT4-fs: %s quota file already "
+					"specified.\n", QTYPE2NAME(qtype));
+				kfree(qname);
+				return 0;
+			}
+			sbi->s_qf_names[qtype] = qname;
+			if (strchr(sbi->s_qf_names[qtype], '/')) {
+				printk(KERN_ERR
+					"EXT4-fs: quotafile must be on "
+					"filesystem root.\n");
+				kfree(sbi->s_qf_names[qtype]);
+				sbi->s_qf_names[qtype] = NULL;
+				return 0;
+			}
+			set_opt(sbi->s_mount_opt, QUOTA);
+			break;
+		case Opt_offusrjquota:
+			qtype = USRQUOTA;
+			goto clear_qf_name;
+		case Opt_offgrpjquota:
+			qtype = GRPQUOTA;
+clear_qf_name:
+			if (sb_any_quota_enabled(sb)) {
+				printk(KERN_ERR "EXT4-fs: Cannot change "
+					"journalled quota options when "
+					"quota turned on.\n");
+				return 0;
+			}
+			/*
+			 * The space will be released later when all options
+			 * are confirmed to be correct
+			 */
+			sbi->s_qf_names[qtype] = NULL;
+			break;
+		case Opt_jqfmt_vfsold:
+			sbi->s_jquota_fmt = QFMT_VFS_OLD;
+			break;
+		case Opt_jqfmt_vfsv0:
+			sbi->s_jquota_fmt = QFMT_VFS_V0;
+			break;
+		case Opt_quota:
+		case Opt_usrquota:
+			set_opt(sbi->s_mount_opt, QUOTA);
+			set_opt(sbi->s_mount_opt, USRQUOTA);
+			break;
+		case Opt_grpquota:
+			set_opt(sbi->s_mount_opt, QUOTA);
+			set_opt(sbi->s_mount_opt, GRPQUOTA);
+			break;
+		case Opt_noquota:
+			if (sb_any_quota_enabled(sb)) {
+				printk(KERN_ERR "EXT4-fs: Cannot change quota "
+					"options when quota turned on.\n");
+				return 0;
+			}
+			clear_opt(sbi->s_mount_opt, QUOTA);
+			clear_opt(sbi->s_mount_opt, USRQUOTA);
+			clear_opt(sbi->s_mount_opt, GRPQUOTA);
+			break;
+#else
+		case Opt_quota:
+		case Opt_usrquota:
+		case Opt_grpquota:
+		case Opt_usrjquota:
+		case Opt_grpjquota:
+		case Opt_offusrjquota:
+		case Opt_offgrpjquota:
+		case Opt_jqfmt_vfsold:
+		case Opt_jqfmt_vfsv0:
+			printk(KERN_ERR
+				"EXT4-fs: journalled quota options not "
+				"supported.\n");
+			break;
+		case Opt_noquota:
+			break;
+#endif
+		case Opt_abort:
+			set_opt(sbi->s_mount_opt, ABORT);
+			break;
+		case Opt_barrier:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option)
+				set_opt(sbi->s_mount_opt, BARRIER);
+			else
+				clear_opt(sbi->s_mount_opt, BARRIER);
+			break;
+		case Opt_ignore:
+			break;
+		case Opt_resize:
+			if (!is_remount) {
+				printk("EXT4-fs: resize option only available "
+					"for remount\n");
+				return 0;
+			}
+			if (match_int(&args[0], &option) != 0)
+				return 0;
+			*n_blocks_count = option;
+			break;
+		case Opt_nobh:
+			set_opt(sbi->s_mount_opt, NOBH);
+			break;
+		case Opt_bh:
+			clear_opt(sbi->s_mount_opt, NOBH);
+			break;
+		case Opt_extents:
+			set_opt (sbi->s_mount_opt, EXTENTS);
+			break;
+		default:
+			printk (KERN_ERR
+				"EXT4-fs: Unrecognized mount option \"%s\" "
+				"or missing value\n", p);
+			return 0;
+		}
+	}
+#ifdef CONFIG_QUOTA
+	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
+		if ((sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA) &&
+		     sbi->s_qf_names[USRQUOTA])
+			clear_opt(sbi->s_mount_opt, USRQUOTA);
+
+		if ((sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA) &&
+		     sbi->s_qf_names[GRPQUOTA])
+			clear_opt(sbi->s_mount_opt, GRPQUOTA);
+
+		if ((sbi->s_qf_names[USRQUOTA] &&
+				(sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
+		    (sbi->s_qf_names[GRPQUOTA] &&
+				(sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
+			printk(KERN_ERR "EXT4-fs: old and new quota "
+					"format mixing.\n");
+			return 0;
+		}
+
+		if (!sbi->s_jquota_fmt) {
+			printk(KERN_ERR "EXT4-fs: journalled quota format "
+					"not specified.\n");
+			return 0;
+		}
+	} else {
+		if (sbi->s_jquota_fmt) {
+			printk(KERN_ERR "EXT4-fs: journalled quota format "
+					"specified with no journalling "
+					"enabled.\n");
+			return 0;
+		}
+	}
+#endif
+	return 1;
+}
+
+static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
+			    int read_only)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int res = 0;
+
+	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
+		printk (KERN_ERR "EXT4-fs warning: revision level too high, "
+			"forcing read-only mode\n");
+		res = MS_RDONLY;
+	}
+	if (read_only)
+		return res;
+	if (!(sbi->s_mount_state & EXT4_VALID_FS))
+		printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
+			"running e2fsck is recommended\n");
+	else if ((sbi->s_mount_state & EXT4_ERROR_FS))
+		printk (KERN_WARNING
+			"EXT4-fs warning: mounting fs with errors, "
+			"running e2fsck is recommended\n");
+	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+		 le16_to_cpu(es->s_mnt_count) >=
+		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+		printk (KERN_WARNING
+			"EXT4-fs warning: maximal mount count reached, "
+			"running e2fsck is recommended\n");
+	else if (le32_to_cpu(es->s_checkinterval) &&
+		(le32_to_cpu(es->s_lastcheck) +
+			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
+		printk (KERN_WARNING
+			"EXT4-fs warning: checktime reached, "
+			"running e2fsck is recommended\n");
+#if 0
+		/* @@@ We _will_ want to clear the valid bit if we find
+		 * inconsistencies, to force a fsck at reboot.  But for
+		 * a plain journaled filesystem we can keep it set as
+		 * valid forever! :)
+		 */
+	es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS);
+#endif
+	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
+		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
+	es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
+	es->s_mtime = cpu_to_le32(get_seconds());
+	ext4_update_dynamic_rev(sb);
+	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+
+	ext4_commit_super(sb, es, 1);
+	if (test_opt(sb, DEBUG))
+		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
+				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
+			sb->s_blocksize,
+			sbi->s_groups_count,
+			EXT4_BLOCKS_PER_GROUP(sb),
+			EXT4_INODES_PER_GROUP(sb),
+			sbi->s_mount_opt);
+
+	printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
+	if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
+		char b[BDEVNAME_SIZE];
+
+		printk("external journal on %s\n",
+			bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
+	} else {
+		printk("internal journal\n");
+	}
+	return res;
+}
+
+/* Called at mount-time, super-block is locked */
+static int ext4_check_descriptors (struct super_block * sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
+	ext4_fsblk_t last_block;
+	ext4_fsblk_t block_bitmap;
+	ext4_fsblk_t inode_bitmap;
+	ext4_fsblk_t inode_table;
+	struct ext4_group_desc * gdp = NULL;
+	int desc_block = 0;
+	int i;
+
+	ext4_debug ("Checking group descriptors");
+
+	for (i = 0; i < sbi->s_groups_count; i++)
+	{
+		if (i == sbi->s_groups_count - 1)
+			last_block = ext4_blocks_count(sbi->s_es) - 1;
+		else
+			last_block = first_block +
+				(EXT4_BLOCKS_PER_GROUP(sb) - 1);
+
+		if ((i % EXT4_DESC_PER_BLOCK(sb)) == 0)
+			gdp = (struct ext4_group_desc *)
+					sbi->s_group_desc[desc_block++]->b_data;
+		block_bitmap = ext4_block_bitmap(sb, gdp);
+		if (block_bitmap < first_block || block_bitmap > last_block)
+		{
+			ext4_error (sb, "ext4_check_descriptors",
+				    "Block bitmap for group %d"
+				    " not in group (block %llu)!",
+				    i, block_bitmap);
+			return 0;
+		}
+		inode_bitmap = ext4_inode_bitmap(sb, gdp);
+		if (inode_bitmap < first_block || inode_bitmap > last_block)
+		{
+			ext4_error (sb, "ext4_check_descriptors",
+				    "Inode bitmap for group %d"
+				    " not in group (block %llu)!",
+				    i, inode_bitmap);
+			return 0;
+		}
+		inode_table = ext4_inode_table(sb, gdp);
+		if (inode_table < first_block ||
+		    inode_table + sbi->s_itb_per_group > last_block)
+		{
+			ext4_error (sb, "ext4_check_descriptors",
+				    "Inode table for group %d"
+				    " not in group (block %llu)!",
+				    i, inode_table);
+			return 0;
+		}
+		first_block += EXT4_BLOCKS_PER_GROUP(sb);
+		gdp = (struct ext4_group_desc *)
+			((__u8 *)gdp + EXT4_DESC_SIZE(sb));
+	}
+
+	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
+	sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
+	return 1;
+}
+
+
+/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
+ * the superblock) which were deleted from all directories, but held open by
+ * a process at the time of a crash.  We walk the list and try to delete these
+ * inodes at recovery time (only with a read-write filesystem).
+ *
+ * In order to keep the orphan inode chain consistent during traversal (in
+ * case of crash during recovery), we link each inode into the superblock
+ * orphan list_head and handle it the same way as an inode deletion during
+ * normal operation (which journals the operations for us).
+ *
+ * We only do an iget() and an iput() on each inode, which is very safe if we
+ * accidentally point at an in-use or already deleted inode.  The worst that
+ * can happen in this case is that we get a "bit already cleared" message from
+ * ext4_free_inode().  The only reason we would point at a wrong inode is if
+ * e2fsck was run on this filesystem, and it must have already done the orphan
+ * inode cleanup for us, so we can safely abort without any further action.
+ */
+static void ext4_orphan_cleanup (struct super_block * sb,
+				 struct ext4_super_block * es)
+{
+	unsigned int s_flags = sb->s_flags;
+	int nr_orphans = 0, nr_truncates = 0;
+#ifdef CONFIG_QUOTA
+	int i;
+#endif
+	if (!es->s_last_orphan) {
+		jbd_debug(4, "no orphan inodes to clean up\n");
+		return;
+	}
+
+	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+		if (es->s_last_orphan)
+			jbd_debug(1, "Errors on filesystem, "
+				  "clearing orphan list.\n");
+		es->s_last_orphan = 0;
+		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+		return;
+	}
+
+	if (s_flags & MS_RDONLY) {
+		printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
+		       sb->s_id);
+		sb->s_flags &= ~MS_RDONLY;
+	}
+#ifdef CONFIG_QUOTA
+	/* Needed for iput() to work correctly and not trash data */
+	sb->s_flags |= MS_ACTIVE;
+	/* Turn on quotas so that they are updated correctly */
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (EXT4_SB(sb)->s_qf_names[i]) {
+			int ret = ext4_quota_on_mount(sb, i);
+			if (ret < 0)
+				printk(KERN_ERR
+					"EXT4-fs: Cannot turn on journalled "
+					"quota: error %d\n", ret);
+		}
+	}
+#endif
+
+	while (es->s_last_orphan) {
+		struct inode *inode;
+
+		if (!(inode =
+		      ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
+			es->s_last_orphan = 0;
+			break;
+		}
+
+		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+		DQUOT_INIT(inode);
+		if (inode->i_nlink) {
+			printk(KERN_DEBUG
+				"%s: truncating inode %lu to %Ld bytes\n",
+				__FUNCTION__, inode->i_ino, inode->i_size);
+			jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+				  inode->i_ino, inode->i_size);
+			ext4_truncate(inode);
+			nr_truncates++;
+		} else {
+			printk(KERN_DEBUG
+				"%s: deleting unreferenced inode %lu\n",
+				__FUNCTION__, inode->i_ino);
+			jbd_debug(2, "deleting unreferenced inode %lu\n",
+				  inode->i_ino);
+			nr_orphans++;
+		}
+		iput(inode);  /* The delete magic happens here! */
+	}
+
+#define PLURAL(x) (x), ((x)==1) ? "" : "s"
+
+	if (nr_orphans)
+		printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
+		       sb->s_id, PLURAL(nr_orphans));
+	if (nr_truncates)
+		printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
+		       sb->s_id, PLURAL(nr_truncates));
+#ifdef CONFIG_QUOTA
+	/* Turn quotas off */
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (sb_dqopt(sb)->files[i])
+			vfs_quota_off(sb, i);
+	}
+#endif
+	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+}
+
+#define log2(n) ffz(~(n))
+
+/*
+ * Maximal file size.  There is a direct, and {,double-,triple-}indirect
+ * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks.
+ * We need to be 1 filesystem block less than the 2^32 sector limit.
+ */
+static loff_t ext4_max_size(int bits)
+{
+	loff_t res = EXT4_NDIR_BLOCKS;
+	/* This constant is calculated to be the largest file size for a
+	 * dense, 4k-blocksize file such that the total number of
+	 * sectors in the file, including data and all indirect blocks,
+	 * does not exceed 2^32. */
+	const loff_t upper_limit = 0x1ff7fffd000LL;
+
+	res += 1LL << (bits-2);
+	res += 1LL << (2*(bits-2));
+	res += 1LL << (3*(bits-2));
+	res <<= bits;
+	if (res > upper_limit)
+		res = upper_limit;
+	return res;
+}
+
+static ext4_fsblk_t descriptor_loc(struct super_block *sb,
+				ext4_fsblk_t logical_sb_block, int nr)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	unsigned long bg, first_meta_bg;
+	int has_super = 0;
+
+	first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+	    nr < first_meta_bg)
+		return logical_sb_block + nr + 1;
+	bg = sbi->s_desc_per_block * nr;
+	if (ext4_bg_has_super(sb, bg))
+		has_super = 1;
+	return (has_super + ext4_group_first_block_no(sb, bg));
+}
+
+
+static int ext4_fill_super (struct super_block *sb, void *data, int silent)
+{
+	struct buffer_head * bh;
+	struct ext4_super_block *es = NULL;
+	struct ext4_sb_info *sbi;
+	ext4_fsblk_t block;
+	ext4_fsblk_t sb_block = get_sb_block(&data);
+	ext4_fsblk_t logical_sb_block;
+	unsigned long offset = 0;
+	unsigned int journal_inum = 0;
+	unsigned long journal_devnum = 0;
+	unsigned long def_mount_opts;
+	struct inode *root;
+	int blocksize;
+	int hblock;
+	int db_count;
+	int i;
+	int needs_recovery;
+	__le32 features;
+	__u64 blocks_count;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+	sbi->s_mount_opt = 0;
+	sbi->s_resuid = EXT4_DEF_RESUID;
+	sbi->s_resgid = EXT4_DEF_RESGID;
+
+	unlock_kernel();
+
+	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
+	if (!blocksize) {
+		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
+		goto out_fail;
+	}
+
+	/*
+	 * The ext4 superblock will not be buffer aligned for other than 1kB
+	 * block sizes.  We need to calculate the offset from buffer start.
+	 */
+	if (blocksize != EXT4_MIN_BLOCK_SIZE) {
+		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+		offset = do_div(logical_sb_block, blocksize);
+	} else {
+		logical_sb_block = sb_block;
+	}
+
+	if (!(bh = sb_bread(sb, logical_sb_block))) {
+		printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
+		goto out_fail;
+	}
+	/*
+	 * Note: s_es must be initialized as soon as possible because
+	 *       some ext4 macro-instructions depend on its value
+	 */
+	es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+	sbi->s_es = es;
+	sb->s_magic = le16_to_cpu(es->s_magic);
+	if (sb->s_magic != EXT4_SUPER_MAGIC)
+		goto cantfind_ext4;
+
+	/* Set defaults before we parse the mount options */
+	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+	if (def_mount_opts & EXT4_DEFM_DEBUG)
+		set_opt(sbi->s_mount_opt, DEBUG);
+	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
+		set_opt(sbi->s_mount_opt, GRPID);
+	if (def_mount_opts & EXT4_DEFM_UID16)
+		set_opt(sbi->s_mount_opt, NO_UID32);
+	if (def_mount_opts & EXT4_DEFM_XATTR_USER)
+		set_opt(sbi->s_mount_opt, XATTR_USER);
+	if (def_mount_opts & EXT4_DEFM_ACL)
+		set_opt(sbi->s_mount_opt, POSIX_ACL);
+	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
+		sbi->s_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
+	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
+		sbi->s_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
+	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
+		sbi->s_mount_opt |= EXT4_MOUNT_WRITEBACK_DATA;
+
+	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
+		set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_RO)
+		set_opt(sbi->s_mount_opt, ERRORS_RO);
+	else
+		set_opt(sbi->s_mount_opt, ERRORS_CONT);
+
+	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
+	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+
+	set_opt(sbi->s_mount_opt, RESERVATION);
+
+	if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
+			    NULL, 0))
+		goto failed_mount;
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
+	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
+	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
+	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
+	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+		printk(KERN_WARNING
+		       "EXT4-fs warning: feature flags set on rev 0 fs, "
+		       "running e2fsck is recommended\n");
+	/*
+	 * Check feature flags regardless of the revision level, since we
+	 * previously didn't change the revision level when setting the flags,
+	 * so there is a chance incompat flags are set on a rev 0 filesystem.
+	 */
+	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
+	if (features) {
+		printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
+		       "unsupported optional features (%x).\n",
+		       sb->s_id, le32_to_cpu(features));
+		goto failed_mount;
+	}
+	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
+	if (!(sb->s_flags & MS_RDONLY) && features) {
+		printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
+		       "unsupported optional features (%x).\n",
+		       sb->s_id, le32_to_cpu(features));
+		goto failed_mount;
+	}
+	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+	    blocksize > EXT4_MAX_BLOCK_SIZE) {
+		printk(KERN_ERR
+		       "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
+		       blocksize, sb->s_id);
+		goto failed_mount;
+	}
+
+	hblock = bdev_hardsect_size(sb->s_bdev);
+	if (sb->s_blocksize != blocksize) {
+		/*
+		 * Make sure the blocksize for the filesystem is larger
+		 * than the hardware sectorsize for the machine.
+		 */
+		if (blocksize < hblock) {
+			printk(KERN_ERR "EXT4-fs: blocksize %d too small for "
+			       "device blocksize %d.\n", blocksize, hblock);
+			goto failed_mount;
+		}
+
+		brelse (bh);
+		sb_set_blocksize(sb, blocksize);
+		logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
+		offset = do_div(logical_sb_block, blocksize);
+		bh = sb_bread(sb, logical_sb_block);
+		if (!bh) {
+			printk(KERN_ERR
+			       "EXT4-fs: Can't read superblock on 2nd try.\n");
+			goto failed_mount;
+		}
+		es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
+		sbi->s_es = es;
+		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
+			printk (KERN_ERR
+				"EXT4-fs: Magic mismatch, very weird !\n");
+			goto failed_mount;
+		}
+	}
+
+	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+
+	if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
+		sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
+		sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
+	} else {
+		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
+		    (sbi->s_inode_size & (sbi->s_inode_size - 1)) ||
+		    (sbi->s_inode_size > blocksize)) {
+			printk (KERN_ERR
+				"EXT4-fs: unsupported inode size: %d\n",
+				sbi->s_inode_size);
+			goto failed_mount;
+		}
+	}
+	sbi->s_frag_size = EXT4_MIN_FRAG_SIZE <<
+				   le32_to_cpu(es->s_log_frag_size);
+	if (blocksize != sbi->s_frag_size) {
+		printk(KERN_ERR
+		       "EXT4-fs: fragsize %lu != blocksize %u (unsupported)\n",
+		       sbi->s_frag_size, blocksize);
+		goto failed_mount;
+	}
+	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
+		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
+		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
+		    sbi->s_desc_size & (sbi->s_desc_size - 1)) {
+			printk(KERN_ERR
+			       "EXT4-fs: unsupported descriptor size %lu\n",
+			       sbi->s_desc_size);
+			goto failed_mount;
+		}
+	} else
+		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+	sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
+	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+	if (EXT4_INODE_SIZE(sb) == 0)
+		goto cantfind_ext4;
+	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
+	if (sbi->s_inodes_per_block == 0)
+		goto cantfind_ext4;
+	sbi->s_itb_per_group = sbi->s_inodes_per_group /
+					sbi->s_inodes_per_block;
+	sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
+	sbi->s_sbh = bh;
+	sbi->s_mount_state = le16_to_cpu(es->s_state);
+	sbi->s_addr_per_block_bits = log2(EXT4_ADDR_PER_BLOCK(sb));
+	sbi->s_desc_per_block_bits = log2(EXT4_DESC_PER_BLOCK(sb));
+	for (i=0; i < 4; i++)
+		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
+	sbi->s_def_hash_version = es->s_def_hash_version;
+
+	if (sbi->s_blocks_per_group > blocksize * 8) {
+		printk (KERN_ERR
+			"EXT4-fs: #blocks per group too big: %lu\n",
+			sbi->s_blocks_per_group);
+		goto failed_mount;
+	}
+	if (sbi->s_frags_per_group > blocksize * 8) {
+		printk (KERN_ERR
+			"EXT4-fs: #fragments per group too big: %lu\n",
+			sbi->s_frags_per_group);
+		goto failed_mount;
+	}
+	if (sbi->s_inodes_per_group > blocksize * 8) {
+		printk (KERN_ERR
+			"EXT4-fs: #inodes per group too big: %lu\n",
+			sbi->s_inodes_per_group);
+		goto failed_mount;
+	}
+
+	if (ext4_blocks_count(es) >
+		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+		printk(KERN_ERR "EXT4-fs: filesystem on %s:"
+			" too large to mount safely\n", sb->s_id);
+		if (sizeof(sector_t) < 8)
+			printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
+					"enabled\n");
+		goto failed_mount;
+	}
+
+	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
+		goto cantfind_ext4;
+	blocks_count = (ext4_blocks_count(es) -
+			le32_to_cpu(es->s_first_data_block) +
+			EXT4_BLOCKS_PER_GROUP(sb) - 1);
+	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+	sbi->s_groups_count = blocks_count;
+	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+		   EXT4_DESC_PER_BLOCK(sb);
+	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
+				    GFP_KERNEL);
+	if (sbi->s_group_desc == NULL) {
+		printk (KERN_ERR "EXT4-fs: not enough memory\n");
+		goto failed_mount;
+	}
+
+	bgl_lock_init(&sbi->s_blockgroup_lock);
+
+	for (i = 0; i < db_count; i++) {
+		block = descriptor_loc(sb, logical_sb_block, i);
+		sbi->s_group_desc[i] = sb_bread(sb, block);
+		if (!sbi->s_group_desc[i]) {
+			printk (KERN_ERR "EXT4-fs: "
+				"can't read group descriptor %d\n", i);
+			db_count = i;
+			goto failed_mount2;
+		}
+	}
+	if (!ext4_check_descriptors (sb)) {
+		printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
+		goto failed_mount2;
+	}
+	sbi->s_gdb_count = db_count;
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+	spin_lock_init(&sbi->s_next_gen_lock);
+
+	percpu_counter_init(&sbi->s_freeblocks_counter,
+		ext4_count_free_blocks(sb));
+	percpu_counter_init(&sbi->s_freeinodes_counter,
+		ext4_count_free_inodes(sb));
+	percpu_counter_init(&sbi->s_dirs_counter,
+		ext4_count_dirs(sb));
+
+	/* per fileystem reservation list head & lock */
+	spin_lock_init(&sbi->s_rsv_window_lock);
+	sbi->s_rsv_window_root = RB_ROOT;
+	/* Add a single, static dummy reservation to the start of the
+	 * reservation window list --- it gives us a placeholder for
+	 * append-at-start-of-list which makes the allocation logic
+	 * _much_ simpler. */
+	sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+	sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
+	sbi->s_rsv_window_head.rsv_alloc_hit = 0;
+	sbi->s_rsv_window_head.rsv_goal_size = 0;
+	ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
+
+	/*
+	 * set up enough so that it can read an inode
+	 */
+	sb->s_op = &ext4_sops;
+	sb->s_export_op = &ext4_export_ops;
+	sb->s_xattr = ext4_xattr_handlers;
+#ifdef CONFIG_QUOTA
+	sb->s_qcop = &ext4_qctl_operations;
+	sb->dq_op = &ext4_quota_operations;
+#endif
+	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+
+	sb->s_root = NULL;
+
+	needs_recovery = (es->s_last_orphan != 0 ||
+			  EXT4_HAS_INCOMPAT_FEATURE(sb,
+				    EXT4_FEATURE_INCOMPAT_RECOVER));
+
+	/*
+	 * The first inode we look at is the journal inode.  Don't try
+	 * root first: it may be modified in the journal!
+	 */
+	if (!test_opt(sb, NOLOAD) &&
+	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+		if (ext4_load_journal(sb, es, journal_devnum))
+			goto failed_mount3;
+	} else if (journal_inum) {
+		if (ext4_create_journal(sb, es, journal_inum))
+			goto failed_mount3;
+	} else {
+		if (!silent)
+			printk (KERN_ERR
+				"ext4: No journal on filesystem on %s\n",
+				sb->s_id);
+		goto failed_mount3;
+	}
+
+	/* We have now updated the journal if required, so we can
+	 * validate the data journaling mode. */
+	switch (test_opt(sb, DATA_FLAGS)) {
+	case 0:
+		/* No mode set, assume a default based on the journal
+		 * capabilities: ORDERED_DATA if the journal can
+		 * cope, else JOURNAL_DATA
+		 */
+		if (jbd2_journal_check_available_features
+		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE))
+			set_opt(sbi->s_mount_opt, ORDERED_DATA);
+		else
+			set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+		break;
+
+	case EXT4_MOUNT_ORDERED_DATA:
+	case EXT4_MOUNT_WRITEBACK_DATA:
+		if (!jbd2_journal_check_available_features
+		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
+			printk(KERN_ERR "EXT4-fs: Journal does not support "
+			       "requested data journaling mode\n");
+			goto failed_mount4;
+		}
+	default:
+		break;
+	}
+
+	if (test_opt(sb, NOBH)) {
+		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
+			printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
+				"its supported only with writeback mode\n");
+			clear_opt(sbi->s_mount_opt, NOBH);
+		}
+	}
+	/*
+	 * The jbd2_journal_load will have done any necessary log recovery,
+	 * so we can safely mount the rest of the filesystem now.
+	 */
+
+	root = iget(sb, EXT4_ROOT_INO);
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		printk(KERN_ERR "EXT4-fs: get root inode failed\n");
+		iput(root);
+		goto failed_mount4;
+	}
+	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+		dput(sb->s_root);
+		sb->s_root = NULL;
+		printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
+		goto failed_mount4;
+	}
+
+	ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+	/*
+	 * akpm: core read_super() calls in here with the superblock locked.
+	 * That deadlocks, because orphan cleanup needs to lock the superblock
+	 * in numerous places.  Here we just pop the lock - it's relatively
+	 * harmless, because we are now ready to accept write_super() requests,
+	 * and aviro says that's the only reason for hanging onto the
+	 * superblock lock.
+	 */
+	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
+	ext4_orphan_cleanup(sb, es);
+	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
+	if (needs_recovery)
+		printk (KERN_INFO "EXT4-fs: recovery complete.\n");
+	ext4_mark_recovery_complete(sb, es);
+	printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
+		test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
+		test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
+		"writeback");
+
+	ext4_ext_init(sb);
+
+	lock_kernel();
+	return 0;
+
+cantfind_ext4:
+	if (!silent)
+		printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
+		       sb->s_id);
+	goto failed_mount;
+
+failed_mount4:
+	jbd2_journal_destroy(sbi->s_journal);
+failed_mount3:
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+failed_mount2:
+	for (i = 0; i < db_count; i++)
+		brelse(sbi->s_group_desc[i]);
+	kfree(sbi->s_group_desc);
+failed_mount:
+#ifdef CONFIG_QUOTA
+	for (i = 0; i < MAXQUOTAS; i++)
+		kfree(sbi->s_qf_names[i]);
+#endif
+	ext4_blkdev_remove(sbi);
+	brelse(bh);
+out_fail:
+	sb->s_fs_info = NULL;
+	kfree(sbi);
+	lock_kernel();
+	return -EINVAL;
+}
+
+/*
+ * Setup any per-fs journal parameters now.  We'll do this both on
+ * initial mount, once the journal has been initialised but before we've
+ * done any recovery; and again on any subsequent remount.
+ */
+static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sbi->s_commit_interval)
+		journal->j_commit_interval = sbi->s_commit_interval;
+	/* We could also set up an ext4-specific default for the commit
+	 * interval here, but for now we'll just fall back to the jbd
+	 * default. */
+
+	spin_lock(&journal->j_state_lock);
+	if (test_opt(sb, BARRIER))
+		journal->j_flags |= JBD2_BARRIER;
+	else
+		journal->j_flags &= ~JBD2_BARRIER;
+	spin_unlock(&journal->j_state_lock);
+}
+
+static journal_t *ext4_get_journal(struct super_block *sb,
+				   unsigned int journal_inum)
+{
+	struct inode *journal_inode;
+	journal_t *journal;
+
+	/* First, test for the existence of a valid inode on disk.  Bad
+	 * things happen if we iget() an unused inode, as the subsequent
+	 * iput() will try to delete it. */
+
+	journal_inode = iget(sb, journal_inum);
+	if (!journal_inode) {
+		printk(KERN_ERR "EXT4-fs: no journal found.\n");
+		return NULL;
+	}
+	if (!journal_inode->i_nlink) {
+		make_bad_inode(journal_inode);
+		iput(journal_inode);
+		printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
+		return NULL;
+	}
+
+	jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+		  journal_inode, journal_inode->i_size);
+	if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) {
+		printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
+		iput(journal_inode);
+		return NULL;
+	}
+
+	journal = jbd2_journal_init_inode(journal_inode);
+	if (!journal) {
+		printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
+		iput(journal_inode);
+		return NULL;
+	}
+	journal->j_private = sb;
+	ext4_init_journal_params(sb, journal);
+	return journal;
+}
+
+static journal_t *ext4_get_dev_journal(struct super_block *sb,
+				       dev_t j_dev)
+{
+	struct buffer_head * bh;
+	journal_t *journal;
+	ext4_fsblk_t start;
+	ext4_fsblk_t len;
+	int hblock, blocksize;
+	ext4_fsblk_t sb_block;
+	unsigned long offset;
+	struct ext4_super_block * es;
+	struct block_device *bdev;
+
+	bdev = ext4_blkdev_get(j_dev);
+	if (bdev == NULL)
+		return NULL;
+
+	if (bd_claim(bdev, sb)) {
+		printk(KERN_ERR
+		        "EXT4: failed to claim external journal device.\n");
+		blkdev_put(bdev);
+		return NULL;
+	}
+
+	blocksize = sb->s_blocksize;
+	hblock = bdev_hardsect_size(bdev);
+	if (blocksize < hblock) {
+		printk(KERN_ERR
+			"EXT4-fs: blocksize too small for journal device.\n");
+		goto out_bdev;
+	}
+
+	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
+	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
+	set_blocksize(bdev, blocksize);
+	if (!(bh = __bread(bdev, sb_block, blocksize))) {
+		printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
+		       "external journal\n");
+		goto out_bdev;
+	}
+
+	es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
+	    !(le32_to_cpu(es->s_feature_incompat) &
+	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
+		printk(KERN_ERR "EXT4-fs: external journal has "
+					"bad superblock\n");
+		brelse(bh);
+		goto out_bdev;
+	}
+
+	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
+		printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
+		brelse(bh);
+		goto out_bdev;
+	}
+
+	len = ext4_blocks_count(es);
+	start = sb_block + 1;
+	brelse(bh);	/* we're done with the superblock */
+
+	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
+					start, len, blocksize);
+	if (!journal) {
+		printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
+		goto out_bdev;
+	}
+	journal->j_private = sb;
+	ll_rw_block(READ, 1, &journal->j_sb_buffer);
+	wait_on_buffer(journal->j_sb_buffer);
+	if (!buffer_uptodate(journal->j_sb_buffer)) {
+		printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
+		goto out_journal;
+	}
+	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
+		printk(KERN_ERR "EXT4-fs: External journal has more than one "
+					"user (unsupported) - %d\n",
+			be32_to_cpu(journal->j_superblock->s_nr_users));
+		goto out_journal;
+	}
+	EXT4_SB(sb)->journal_bdev = bdev;
+	ext4_init_journal_params(sb, journal);
+	return journal;
+out_journal:
+	jbd2_journal_destroy(journal);
+out_bdev:
+	ext4_blkdev_put(bdev);
+	return NULL;
+}
+
+static int ext4_load_journal(struct super_block *sb,
+			     struct ext4_super_block *es,
+			     unsigned long journal_devnum)
+{
+	journal_t *journal;
+	unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
+	dev_t journal_dev;
+	int err = 0;
+	int really_read_only;
+
+	if (journal_devnum &&
+	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
+		printk(KERN_INFO "EXT4-fs: external journal device major/minor "
+			"numbers have changed\n");
+		journal_dev = new_decode_dev(journal_devnum);
+	} else
+		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
+
+	really_read_only = bdev_read_only(sb->s_bdev);
+
+	/*
+	 * Are we loading a blank journal or performing recovery after a
+	 * crash?  For recovery, we need to check in advance whether we
+	 * can get read-write access to the device.
+	 */
+
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+		if (sb->s_flags & MS_RDONLY) {
+			printk(KERN_INFO "EXT4-fs: INFO: recovery "
+					"required on readonly filesystem.\n");
+			if (really_read_only) {
+				printk(KERN_ERR "EXT4-fs: write access "
+					"unavailable, cannot proceed.\n");
+				return -EROFS;
+			}
+			printk (KERN_INFO "EXT4-fs: write access will "
+					"be enabled during recovery.\n");
+		}
+	}
+
+	if (journal_inum && journal_dev) {
+		printk(KERN_ERR "EXT4-fs: filesystem has both journal "
+		       "and inode journals!\n");
+		return -EINVAL;
+	}
+
+	if (journal_inum) {
+		if (!(journal = ext4_get_journal(sb, journal_inum)))
+			return -EINVAL;
+	} else {
+		if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
+			return -EINVAL;
+	}
+
+	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
+		err = jbd2_journal_update_format(journal);
+		if (err)  {
+			printk(KERN_ERR "EXT4-fs: error updating journal.\n");
+			jbd2_journal_destroy(journal);
+			return err;
+		}
+	}
+
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
+		err = jbd2_journal_wipe(journal, !really_read_only);
+	if (!err)
+		err = jbd2_journal_load(journal);
+
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: error loading journal.\n");
+		jbd2_journal_destroy(journal);
+		return err;
+	}
+
+	EXT4_SB(sb)->s_journal = journal;
+	ext4_clear_journal_err(sb, es);
+
+	if (journal_devnum &&
+	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
+		es->s_journal_dev = cpu_to_le32(journal_devnum);
+		sb->s_dirt = 1;
+
+		/* Make sure we flush the recovery flag to disk. */
+		ext4_commit_super(sb, es, 1);
+	}
+
+	return 0;
+}
+
+static int ext4_create_journal(struct super_block * sb,
+			       struct ext4_super_block * es,
+			       unsigned int journal_inum)
+{
+	journal_t *journal;
+
+	if (sb->s_flags & MS_RDONLY) {
+		printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
+				"create journal.\n");
+		return -EROFS;
+	}
+
+	if (!(journal = ext4_get_journal(sb, journal_inum)))
+		return -EINVAL;
+
+	printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
+	       journal_inum);
+
+	if (jbd2_journal_create(journal)) {
+		printk(KERN_ERR "EXT4-fs: error creating journal.\n");
+		jbd2_journal_destroy(journal);
+		return -EIO;
+	}
+
+	EXT4_SB(sb)->s_journal = journal;
+
+	ext4_update_dynamic_rev(sb);
+	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+	EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
+
+	es->s_journal_inum = cpu_to_le32(journal_inum);
+	sb->s_dirt = 1;
+
+	/* Make sure we flush the recovery flag to disk. */
+	ext4_commit_super(sb, es, 1);
+
+	return 0;
+}
+
+static void ext4_commit_super (struct super_block * sb,
+			       struct ext4_super_block * es,
+			       int sync)
+{
+	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+
+	if (!sbh)
+		return;
+	es->s_wtime = cpu_to_le32(get_seconds());
+	ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
+	es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+	BUFFER_TRACE(sbh, "marking dirty");
+	mark_buffer_dirty(sbh);
+	if (sync)
+		sync_dirty_buffer(sbh);
+}
+
+
+/*
+ * Have we just finished recovery?  If so, and if we are mounting (or
+ * remounting) the filesystem readonly, then we will end up with a
+ * consistent fs on disk.  Record that fact.
+ */
+static void ext4_mark_recovery_complete(struct super_block * sb,
+					struct ext4_super_block * es)
+{
+	journal_t *journal = EXT4_SB(sb)->s_journal;
+
+	jbd2_journal_lock_updates(journal);
+	jbd2_journal_flush(journal);
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
+	    sb->s_flags & MS_RDONLY) {
+		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		sb->s_dirt = 0;
+		ext4_commit_super(sb, es, 1);
+	}
+	jbd2_journal_unlock_updates(journal);
+}
+
+/*
+ * If we are mounting (or read-write remounting) a filesystem whose journal
+ * has recorded an error from a previous lifetime, move that error to the
+ * main filesystem now.
+ */
+static void ext4_clear_journal_err(struct super_block * sb,
+				   struct ext4_super_block * es)
+{
+	journal_t *journal;
+	int j_errno;
+	const char *errstr;
+
+	journal = EXT4_SB(sb)->s_journal;
+
+	/*
+	 * Now check for any error status which may have been recorded in the
+	 * journal by a prior ext4_error() or ext4_abort()
+	 */
+
+	j_errno = jbd2_journal_errno(journal);
+	if (j_errno) {
+		char nbuf[16];
+
+		errstr = ext4_decode_error(sb, j_errno, nbuf);
+		ext4_warning(sb, __FUNCTION__, "Filesystem error recorded "
+			     "from previous mount: %s", errstr);
+		ext4_warning(sb, __FUNCTION__, "Marking fs in need of "
+			     "filesystem check.");
+
+		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+		ext4_commit_super (sb, es, 1);
+
+		jbd2_journal_clear_err(journal);
+	}
+}
+
+/*
+ * Force the running and committing transactions to commit,
+ * and wait on the commit.
+ */
+int ext4_force_commit(struct super_block *sb)
+{
+	journal_t *journal;
+	int ret;
+
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	journal = EXT4_SB(sb)->s_journal;
+	sb->s_dirt = 0;
+	ret = ext4_journal_force_commit(journal);
+	return ret;
+}
+
+/*
+ * Ext4 always journals updates to the superblock itself, so we don't
+ * have to propagate any other updates to the superblock on disk at this
+ * point.  Just start an async writeback to get the buffers on their way
+ * to the disk.
+ *
+ * This implicitly triggers the writebehind on sync().
+ */
+
+static void ext4_write_super (struct super_block * sb)
+{
+	if (mutex_trylock(&sb->s_lock) != 0)
+		BUG();
+	sb->s_dirt = 0;
+}
+
+static int ext4_sync_fs(struct super_block *sb, int wait)
+{
+	tid_t target;
+
+	sb->s_dirt = 0;
+	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+		if (wait)
+			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+	}
+	return 0;
+}
+
+/*
+ * LVM calls this function before a (read-only) snapshot is created.  This
+ * gives us a chance to flush the journal completely and mark the fs clean.
+ */
+static void ext4_write_super_lockfs(struct super_block *sb)
+{
+	sb->s_dirt = 0;
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		journal_t *journal = EXT4_SB(sb)->s_journal;
+
+		/* Now we set up the journal barrier. */
+		jbd2_journal_lock_updates(journal);
+		jbd2_journal_flush(journal);
+
+		/* Journal blocked and flushed, clear needs_recovery flag. */
+		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+	}
+}
+
+/*
+ * Called by LVM after the snapshot is done.  We need to reset the RECOVER
+ * flag here, even though the filesystem is not technically dirty yet.
+ */
+static void ext4_unlockfs(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY)) {
+		lock_super(sb);
+		/* Reser the needs_recovery flag before the fs is unlocked. */
+		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+		unlock_super(sb);
+		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	}
+}
+
+static int ext4_remount (struct super_block * sb, int * flags, char * data)
+{
+	struct ext4_super_block * es;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	ext4_fsblk_t n_blocks_count = 0;
+	unsigned long old_sb_flags;
+	struct ext4_mount_options old_opts;
+	int err;
+#ifdef CONFIG_QUOTA
+	int i;
+#endif
+
+	/* Store the original options */
+	old_sb_flags = sb->s_flags;
+	old_opts.s_mount_opt = sbi->s_mount_opt;
+	old_opts.s_resuid = sbi->s_resuid;
+	old_opts.s_resgid = sbi->s_resgid;
+	old_opts.s_commit_interval = sbi->s_commit_interval;
+#ifdef CONFIG_QUOTA
+	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
+	for (i = 0; i < MAXQUOTAS; i++)
+		old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+#endif
+
+	/*
+	 * Allow the "check" option to be passed as a remount option.
+	 */
+	if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
+	if (sbi->s_mount_opt & EXT4_MOUNT_ABORT)
+		ext4_abort(sb, __FUNCTION__, "Abort forced by user");
+
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
+	es = sbi->s_es;
+
+	ext4_init_journal_params(sb, sbi->s_journal);
+
+	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
+		n_blocks_count > ext4_blocks_count(es)) {
+		if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) {
+			err = -EROFS;
+			goto restore_opts;
+		}
+
+		if (*flags & MS_RDONLY) {
+			/*
+			 * First of all, the unconditional stuff we have to do
+			 * to disable replay of the journal when we next remount
+			 */
+			sb->s_flags |= MS_RDONLY;
+
+			/*
+			 * OK, test if we are remounting a valid rw partition
+			 * readonly, and if so set the rdonly flag and then
+			 * mark the partition as valid again.
+			 */
+			if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
+			    (sbi->s_mount_state & EXT4_VALID_FS))
+				es->s_state = cpu_to_le16(sbi->s_mount_state);
+
+			ext4_mark_recovery_complete(sb, es);
+		} else {
+			__le32 ret;
+			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+					~EXT4_FEATURE_RO_COMPAT_SUPP))) {
+				printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+				       "remount RDWR because of unsupported "
+				       "optional features (%x).\n",
+				       sb->s_id, le32_to_cpu(ret));
+				err = -EROFS;
+				goto restore_opts;
+			}
+			/*
+			 * Mounting a RDONLY partition read-write, so reread
+			 * and store the current valid flag.  (It may have
+			 * been changed by e2fsck since we originally mounted
+			 * the partition.)
+			 */
+			ext4_clear_journal_err(sb, es);
+			sbi->s_mount_state = le16_to_cpu(es->s_state);
+			if ((err = ext4_group_extend(sb, es, n_blocks_count)))
+				goto restore_opts;
+			if (!ext4_setup_super (sb, es, 0))
+				sb->s_flags &= ~MS_RDONLY;
+		}
+	}
+#ifdef CONFIG_QUOTA
+	/* Release old quota file names */
+	for (i = 0; i < MAXQUOTAS; i++)
+		if (old_opts.s_qf_names[i] &&
+		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
+			kfree(old_opts.s_qf_names[i]);
+#endif
+	return 0;
+restore_opts:
+	sb->s_flags = old_sb_flags;
+	sbi->s_mount_opt = old_opts.s_mount_opt;
+	sbi->s_resuid = old_opts.s_resuid;
+	sbi->s_resgid = old_opts.s_resgid;
+	sbi->s_commit_interval = old_opts.s_commit_interval;
+#ifdef CONFIG_QUOTA
+	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (sbi->s_qf_names[i] &&
+		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
+			kfree(sbi->s_qf_names[i]);
+		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
+	}
+#endif
+	return err;
+}
+
+static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	ext4_fsblk_t overhead;
+	int i;
+
+	if (test_opt (sb, MINIX_DF))
+		overhead = 0;
+	else {
+		unsigned long ngroups;
+		ngroups = EXT4_SB(sb)->s_groups_count;
+		smp_rmb();
+
+		/*
+		 * Compute the overhead (FS structures)
+		 */
+
+		/*
+		 * All of the blocks before first_data_block are
+		 * overhead
+		 */
+		overhead = le32_to_cpu(es->s_first_data_block);
+
+		/*
+		 * Add the overhead attributed to the superblock and
+		 * block group descriptors.  If the sparse superblocks
+		 * feature is turned on, then not all groups have this.
+		 */
+		for (i = 0; i < ngroups; i++) {
+			overhead += ext4_bg_has_super(sb, i) +
+				ext4_bg_num_gdb(sb, i);
+			cond_resched();
+		}
+
+		/*
+		 * Every block group has an inode bitmap, a block
+		 * bitmap, and an inode table.
+		 */
+		overhead += (ngroups * (2 + EXT4_SB(sb)->s_itb_per_group));
+	}
+
+	buf->f_type = EXT4_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = ext4_blocks_count(es) - overhead;
+	buf->f_bfree = percpu_counter_sum(&sbi->s_freeblocks_counter);
+	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
+	if (buf->f_bfree < ext4_r_blocks_count(es))
+		buf->f_bavail = 0;
+	buf->f_files = le32_to_cpu(es->s_inodes_count);
+	buf->f_ffree = percpu_counter_sum(&sbi->s_freeinodes_counter);
+	buf->f_namelen = EXT4_NAME_LEN;
+	return 0;
+}
+
+/* Helper function for writing quotas on sync - we need to start transaction before quota file
+ * is locked for write. Otherwise the are possible deadlocks:
+ * Process 1                         Process 2
+ * ext4_create()                     quota_sync()
+ *   jbd2_journal_start()                   write_dquot()
+ *   DQUOT_INIT()                        down(dqio_mutex)
+ *     down(dqio_mutex)                    jbd2_journal_start()
+ *
+ */
+
+#ifdef CONFIG_QUOTA
+
+static inline struct inode *dquot_to_inode(struct dquot *dquot)
+{
+	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
+}
+
+static int ext4_dquot_initialize(struct inode *inode, int type)
+{
+	handle_t *handle;
+	int ret, err;
+
+	/* We may create quota structure so we need to reserve enough blocks */
+	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_initialize(inode, type);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext4_dquot_drop(struct inode *inode)
+{
+	handle_t *handle;
+	int ret, err;
+
+	/* We may delete quota structure so we need to reserve enough blocks */
+	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_drop(inode);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext4_write_dquot(struct dquot *dquot)
+{
+	int ret, err;
+	handle_t *handle;
+	struct inode *inode;
+
+	inode = dquot_to_inode(dquot);
+	handle = ext4_journal_start(inode,
+					EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_commit(dquot);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext4_acquire_dquot(struct dquot *dquot)
+{
+	int ret, err;
+	handle_t *handle;
+
+	handle = ext4_journal_start(dquot_to_inode(dquot),
+					EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_acquire(dquot);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext4_release_dquot(struct dquot *dquot)
+{
+	int ret, err;
+	handle_t *handle;
+
+	handle = ext4_journal_start(dquot_to_inode(dquot),
+					EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_release(dquot);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+static int ext4_mark_dquot_dirty(struct dquot *dquot)
+{
+	/* Are we journalling quotas? */
+	if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
+	    EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+		dquot_mark_dquot_dirty(dquot);
+		return ext4_write_dquot(dquot);
+	} else {
+		return dquot_mark_dquot_dirty(dquot);
+	}
+}
+
+static int ext4_write_info(struct super_block *sb, int type)
+{
+	int ret, err;
+	handle_t *handle;
+
+	/* Data block + inode block */
+	handle = ext4_journal_start(sb->s_root->d_inode, 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = dquot_commit_info(sb, type);
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+/*
+ * Turn on quotas during mount time - we need to find
+ * the quota file and such...
+ */
+static int ext4_quota_on_mount(struct super_block *sb, int type)
+{
+	return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+			EXT4_SB(sb)->s_jquota_fmt, type);
+}
+
+/*
+ * Standard function to be called on quota_on
+ */
+static int ext4_quota_on(struct super_block *sb, int type, int format_id,
+			 char *path)
+{
+	int err;
+	struct nameidata nd;
+
+	if (!test_opt(sb, QUOTA))
+		return -EINVAL;
+	/* Not journalling quota? */
+	if (!EXT4_SB(sb)->s_qf_names[USRQUOTA] &&
+	    !EXT4_SB(sb)->s_qf_names[GRPQUOTA])
+		return vfs_quota_on(sb, type, format_id, path);
+	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+	if (err)
+		return err;
+	/* Quotafile not on the same filesystem? */
+	if (nd.mnt->mnt_sb != sb) {
+		path_release(&nd);
+		return -EXDEV;
+	}
+	/* Quotafile not of fs root? */
+	if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
+		printk(KERN_WARNING
+			"EXT4-fs: Quota file not on filesystem root. "
+			"Journalled quota will not work.\n");
+	path_release(&nd);
+	return vfs_quota_on(sb, type, format_id, path);
+}
+
+/* Read data from quotafile - avoid pagecache and such because we cannot afford
+ * acquiring the locks... As quota files are never truncated and quota code
+ * itself serializes the operations (and noone else should touch the files)
+ * we don't have to be afraid of races */
+static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
+			       size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+	int err = 0;
+	int offset = off & (sb->s_blocksize - 1);
+	int tocopy;
+	size_t toread;
+	struct buffer_head *bh;
+	loff_t i_size = i_size_read(inode);
+
+	if (off > i_size)
+		return 0;
+	if (off+len > i_size)
+		len = i_size-off;
+	toread = len;
+	while (toread > 0) {
+		tocopy = sb->s_blocksize - offset < toread ?
+				sb->s_blocksize - offset : toread;
+		bh = ext4_bread(NULL, inode, blk, 0, &err);
+		if (err)
+			return err;
+		if (!bh)	/* A hole? */
+			memset(data, 0, tocopy);
+		else
+			memcpy(data, bh->b_data+offset, tocopy);
+		brelse(bh);
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/* Write to quotafile (we know the transaction is already started and has
+ * enough credits) */
+static ssize_t ext4_quota_write(struct super_block *sb, int type,
+				const char *data, size_t len, loff_t off)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	sector_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
+	int err = 0;
+	int offset = off & (sb->s_blocksize - 1);
+	int tocopy;
+	int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
+	size_t towrite = len;
+	struct buffer_head *bh;
+	handle_t *handle = journal_current_handle();
+
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
+	while (towrite > 0) {
+		tocopy = sb->s_blocksize - offset < towrite ?
+				sb->s_blocksize - offset : towrite;
+		bh = ext4_bread(handle, inode, blk, 1, &err);
+		if (!bh)
+			goto out;
+		if (journal_quota) {
+			err = ext4_journal_get_write_access(handle, bh);
+			if (err) {
+				brelse(bh);
+				goto out;
+			}
+		}
+		lock_buffer(bh);
+		memcpy(bh->b_data+offset, data, tocopy);
+		flush_dcache_page(bh->b_page);
+		unlock_buffer(bh);
+		if (journal_quota)
+			err = ext4_journal_dirty_metadata(handle, bh);
+		else {
+			/* Always do at least ordered writes for quotas */
+			err = ext4_journal_dirty_data(handle, bh);
+			mark_buffer_dirty(bh);
+		}
+		brelse(bh);
+		if (err)
+			goto out;
+		offset = 0;
+		towrite -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+out:
+	if (len == towrite)
+		return err;
+	if (inode->i_size < off+len-towrite) {
+		i_size_write(inode, off+len-towrite);
+		EXT4_I(inode)->i_disksize = inode->i_size;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	ext4_mark_inode_dirty(handle, inode);
+	mutex_unlock(&inode->i_mutex);
+	return len - towrite;
+}
+
+#endif
+
+static int ext4_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+}
+
+static struct file_system_type ext4dev_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ext4dev",
+	.get_sb		= ext4_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static int __init init_ext4_fs(void)
+{
+	int err = init_ext4_xattr();
+	if (err)
+		return err;
+	err = init_inodecache();
+	if (err)
+		goto out1;
+	err = register_filesystem(&ext4dev_fs_type);
+	if (err)
+		goto out;
+	return 0;
+out:
+	destroy_inodecache();
+out1:
+	exit_ext4_xattr();
+	return err;
+}
+
+static void __exit exit_ext4_fs(void)
+{
+	unregister_filesystem(&ext4dev_fs_type);
+	destroy_inodecache();
+	exit_ext4_xattr();
+}
+
+MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+MODULE_LICENSE("GPL");
+module_init(init_ext4_fs)
+module_exit(exit_ext4_fs)
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
new file mode 100644
index 000000000000..fcf527286d75
--- /dev/null
+++ b/fs/ext4/symlink.c
@@ -0,0 +1,54 @@
+/*
+ *  linux/fs/ext4/symlink.c
+ *
+ * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/symlink.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext4 symlink handling code
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/namei.h>
+#include "xattr.h"
+
+static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
+	nd_set_link(nd, (char*)ei->i_data);
+	return NULL;
+}
+
+struct inode_operations ext4_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
+
+struct inode_operations ext4_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= ext4_follow_link,
+#ifdef CONFIG_EXT4DEV_FS_XATTR
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= ext4_listxattr,
+	.removexattr	= generic_removexattr,
+#endif
+};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
new file mode 100644
index 000000000000..63233cd946a7
--- /dev/null
+++ b/fs/ext4/xattr.c
@@ -0,0 +1,1317 @@
+/*
+ * linux/fs/ext4/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
+ * Extended attributes for symlinks and special files added per
+ *  suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ *  Red Hat Inc.
+ * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
+ *  and Andreas Gruenbacher <agruen@suse.de>.
+ */
+
+/*
+ * Extended attributes are stored directly in inodes (on file systems with
+ * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
+ * field contains the block number if an inode uses an additional block. All
+ * attributes must fit in the inode and one additional block. Blocks that
+ * contain the identical set of attributes may be shared among several inodes.
+ * Identical blocks are detected by keeping a cache of blocks that have
+ * recently been accessed.
+ *
+ * The attributes in inodes and on blocks have a different header; the entries
+ * are stored in the same format:
+ *
+ *   +------------------+
+ *   | header           |
+ *   | entry 1          | |
+ *   | entry 2          | | growing downwards
+ *   | entry 3          | v
+ *   | four null bytes  |
+ *   | . . .            |
+ *   | value 1          | ^
+ *   | value 3          | | growing upwards
+ *   | value 2          | |
+ *   +------------------+
+ *
+ * The header is followed by multiple entry descriptors. In disk blocks, the
+ * entry descriptors are kept sorted. In inodes, they are unsorted. The
+ * attribute values are aligned to the end of the block in no specific order.
+ *
+ * Locking strategy
+ * ----------------
+ * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
+ * EA blocks are only changed if they are exclusive to an inode, so
+ * holding xattr_sem also means that nothing but the EA block's reference
+ * count can change. Multiple writers to the same block are synchronized
+ * by the buffer lock.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/mbcache.h>
+#include <linux/quotaops.h>
+#include <linux/rwsem.h>
+#include "xattr.h"
+#include "acl.h"
+
+#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
+#define BFIRST(bh) ENTRY(BHDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#define IHDR(inode, raw_inode) \
+	((struct ext4_xattr_ibody_header *) \
+		((void *)raw_inode + \
+		 EXT4_GOOD_OLD_INODE_SIZE + \
+		 EXT4_I(inode)->i_extra_isize))
+#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
+
+#ifdef EXT4_XATTR_DEBUG
+# define ea_idebug(inode, f...) do { \
+		printk(KERN_DEBUG "inode %s:%lu: ", \
+			inode->i_sb->s_id, inode->i_ino); \
+		printk(f); \
+		printk("\n"); \
+	} while (0)
+# define ea_bdebug(bh, f...) do { \
+		char b[BDEVNAME_SIZE]; \
+		printk(KERN_DEBUG "block %s:%lu: ", \
+			bdevname(bh->b_bdev, b), \
+			(unsigned long) bh->b_blocknr); \
+		printk(f); \
+		printk("\n"); \
+	} while (0)
+#else
+# define ea_idebug(f...)
+# define ea_bdebug(f...)
+#endif
+
+static void ext4_xattr_cache_insert(struct buffer_head *);
+static struct buffer_head *ext4_xattr_cache_find(struct inode *,
+						 struct ext4_xattr_header *,
+						 struct mb_cache_entry **);
+static void ext4_xattr_rehash(struct ext4_xattr_header *,
+			      struct ext4_xattr_entry *);
+
+static struct mb_cache *ext4_xattr_cache;
+
+static struct xattr_handler *ext4_xattr_handler_map[] = {
+	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext4_xattr_acl_access_handler,
+	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
+#endif
+	[EXT4_XATTR_INDEX_TRUSTED]	     = &ext4_xattr_trusted_handler,
+#ifdef CONFIG_EXT4DEV_FS_SECURITY
+	[EXT4_XATTR_INDEX_SECURITY]	     = &ext4_xattr_security_handler,
+#endif
+};
+
+struct xattr_handler *ext4_xattr_handlers[] = {
+	&ext4_xattr_user_handler,
+	&ext4_xattr_trusted_handler,
+#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+	&ext4_xattr_acl_access_handler,
+	&ext4_xattr_acl_default_handler,
+#endif
+#ifdef CONFIG_EXT4DEV_FS_SECURITY
+	&ext4_xattr_security_handler,
+#endif
+	NULL
+};
+
+static inline struct xattr_handler *
+ext4_xattr_handler(int name_index)
+{
+	struct xattr_handler *handler = NULL;
+
+	if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
+		handler = ext4_xattr_handler_map[name_index];
+	return handler;
+}
+
+/*
+ * Inode operation listxattr()
+ *
+ * dentry->d_inode->i_mutex: don't care
+ */
+ssize_t
+ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	return ext4_xattr_list(dentry->d_inode, buffer, size);
+}
+
+static int
+ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
+{
+	while (!IS_LAST_ENTRY(entry)) {
+		struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry);
+		if ((void *)next >= end)
+			return -EIO;
+		entry = next;
+	}
+	return 0;
+}
+
+static inline int
+ext4_xattr_check_block(struct buffer_head *bh)
+{
+	int error;
+
+	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
+	    BHDR(bh)->h_blocks != cpu_to_le32(1))
+		return -EIO;
+	error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+	return error;
+}
+
+static inline int
+ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
+{
+	size_t value_size = le32_to_cpu(entry->e_value_size);
+
+	if (entry->e_value_block != 0 || value_size > size ||
+	    le16_to_cpu(entry->e_value_offs) + value_size > size)
+		return -EIO;
+	return 0;
+}
+
+static int
+ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
+		      const char *name, size_t size, int sorted)
+{
+	struct ext4_xattr_entry *entry;
+	size_t name_len;
+	int cmp = 1;
+
+	if (name == NULL)
+		return -EINVAL;
+	name_len = strlen(name);
+	entry = *pentry;
+	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+		cmp = name_index - entry->e_name_index;
+		if (!cmp)
+			cmp = name_len - entry->e_name_len;
+		if (!cmp)
+			cmp = memcmp(name, entry->e_name, name_len);
+		if (cmp <= 0 && (sorted || cmp == 0))
+			break;
+	}
+	*pentry = entry;
+	if (!cmp && ext4_xattr_check_entry(entry, size))
+			return -EIO;
+	return cmp ? -ENODATA : 0;
+}
+
+static int
+ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
+		     void *buffer, size_t buffer_size)
+{
+	struct buffer_head *bh = NULL;
+	struct ext4_xattr_entry *entry;
+	size_t size;
+	int error;
+
+	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
+		  name_index, name, buffer, (long)buffer_size);
+
+	error = -ENODATA;
+	if (!EXT4_I(inode)->i_file_acl)
+		goto cleanup;
+	ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh)
+		goto cleanup;
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+	if (ext4_xattr_check_block(bh)) {
+bad_block:	ext4_error(inode->i_sb, __FUNCTION__,
+			   "inode %lu: bad block %llu", inode->i_ino,
+			   EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		goto cleanup;
+	}
+	ext4_xattr_cache_insert(bh);
+	entry = BFIRST(bh);
+	error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
+	if (error == -EIO)
+		goto bad_block;
+	if (error)
+		goto cleanup;
+	size = le32_to_cpu(entry->e_value_size);
+	if (buffer) {
+		error = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+		memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
+		       size);
+	}
+	error = size;
+
+cleanup:
+	brelse(bh);
+	return error;
+}
+
+static int
+ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
+		     void *buffer, size_t buffer_size)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_entry *entry;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+	size_t size;
+	void *end;
+	int error;
+
+	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+		return -ENODATA;
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+	raw_inode = ext4_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	entry = IFIRST(header);
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	error = ext4_xattr_check_names(entry, end);
+	if (error)
+		goto cleanup;
+	error = ext4_xattr_find_entry(&entry, name_index, name,
+				      end - (void *)entry, 0);
+	if (error)
+		goto cleanup;
+	size = le32_to_cpu(entry->e_value_size);
+	if (buffer) {
+		error = -ERANGE;
+		if (size > buffer_size)
+			goto cleanup;
+		memcpy(buffer, (void *)IFIRST(header) +
+		       le16_to_cpu(entry->e_value_offs), size);
+	}
+	error = size;
+
+cleanup:
+	brelse(iloc.bh);
+	return error;
+}
+
+/*
+ * ext4_xattr_get()
+ *
+ * Copy an extended attribute into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+int
+ext4_xattr_get(struct inode *inode, int name_index, const char *name,
+	       void *buffer, size_t buffer_size)
+{
+	int error;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
+				     buffer_size);
+	if (error == -ENODATA)
+		error = ext4_xattr_block_get(inode, name_index, name, buffer,
+					     buffer_size);
+	up_read(&EXT4_I(inode)->xattr_sem);
+	return error;
+}
+
+static int
+ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
+			char *buffer, size_t buffer_size)
+{
+	size_t rest = buffer_size;
+
+	for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+		struct xattr_handler *handler =
+			ext4_xattr_handler(entry->e_name_index);
+
+		if (handler) {
+			size_t size = handler->list(inode, buffer, rest,
+						    entry->e_name,
+						    entry->e_name_len);
+			if (buffer) {
+				if (size > rest)
+					return -ERANGE;
+				buffer += size;
+			}
+			rest -= size;
+		}
+	}
+	return buffer_size - rest;
+}
+
+static int
+ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
+{
+	struct buffer_head *bh = NULL;
+	int error;
+
+	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
+		  buffer, (long)buffer_size);
+
+	error = 0;
+	if (!EXT4_I(inode)->i_file_acl)
+		goto cleanup;
+	ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	error = -EIO;
+	if (!bh)
+		goto cleanup;
+	ea_bdebug(bh, "b_count=%d, refcount=%d",
+		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
+	if (ext4_xattr_check_block(bh)) {
+		ext4_error(inode->i_sb, __FUNCTION__,
+			   "inode %lu: bad block %llu", inode->i_ino,
+			   EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		goto cleanup;
+	}
+	ext4_xattr_cache_insert(bh);
+	error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+
+cleanup:
+	brelse(bh);
+
+	return error;
+}
+
+static int
+ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	struct ext4_iloc iloc;
+	void *end;
+	int error;
+
+	if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
+		return 0;
+	error = ext4_get_inode_loc(inode, &iloc);
+	if (error)
+		return error;
+	raw_inode = ext4_raw_inode(&iloc);
+	header = IHDR(inode, raw_inode);
+	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	error = ext4_xattr_check_names(IFIRST(header), end);
+	if (error)
+		goto cleanup;
+	error = ext4_xattr_list_entries(inode, IFIRST(header),
+					buffer, buffer_size);
+
+cleanup:
+	brelse(iloc.bh);
+	return error;
+}
+
+/*
+ * ext4_xattr_list()
+ *
+ * Copy a list of attribute names into the buffer
+ * provided, or compute the buffer size required.
+ * Buffer is NULL to compute the size of the buffer required.
+ *
+ * Returns a negative error number on failure, or the number of bytes
+ * used / required on success.
+ */
+int
+ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+{
+	int i_error, b_error;
+
+	down_read(&EXT4_I(inode)->xattr_sem);
+	i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size);
+	if (i_error < 0) {
+		b_error = 0;
+	} else {
+		if (buffer) {
+			buffer += i_error;
+			buffer_size -= i_error;
+		}
+		b_error = ext4_xattr_block_list(inode, buffer, buffer_size);
+		if (b_error < 0)
+			i_error = 0;
+	}
+	up_read(&EXT4_I(inode)->xattr_sem);
+	return i_error + b_error;
+}
+
+/*
+ * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is
+ * not set, set it.
+ */
+static void ext4_xattr_update_super_block(handle_t *handle,
+					  struct super_block *sb)
+{
+	if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
+		return;
+
+	lock_super(sb);
+	if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
+		EXT4_SB(sb)->s_es->s_feature_compat |=
+			cpu_to_le32(EXT4_FEATURE_COMPAT_EXT_ATTR);
+		sb->s_dirt = 1;
+		ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+	}
+	unlock_super(sb);
+}
+
+/*
+ * Release the xattr block BH: If the reference count is > 1, decrement
+ * it; otherwise free the block.
+ */
+static void
+ext4_xattr_release_block(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh)
+{
+	struct mb_cache_entry *ce = NULL;
+
+	ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+	if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+		ea_bdebug(bh, "refcount now=0; freeing");
+		if (ce)
+			mb_cache_entry_free(ce);
+		ext4_free_blocks(handle, inode, bh->b_blocknr, 1);
+		get_bh(bh);
+		ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+	} else {
+		if (ext4_journal_get_write_access(handle, bh) == 0) {
+			lock_buffer(bh);
+			BHDR(bh)->h_refcount = cpu_to_le32(
+				le32_to_cpu(BHDR(bh)->h_refcount) - 1);
+			ext4_journal_dirty_metadata(handle, bh);
+			if (IS_SYNC(inode))
+				handle->h_sync = 1;
+			DQUOT_FREE_BLOCK(inode, 1);
+			unlock_buffer(bh);
+			ea_bdebug(bh, "refcount now=%d; releasing",
+				  le32_to_cpu(BHDR(bh)->h_refcount));
+		}
+		if (ce)
+			mb_cache_entry_release(ce);
+	}
+}
+
+struct ext4_xattr_info {
+	int name_index;
+	const char *name;
+	const void *value;
+	size_t value_len;
+};
+
+struct ext4_xattr_search {
+	struct ext4_xattr_entry *first;
+	void *base;
+	void *end;
+	struct ext4_xattr_entry *here;
+	int not_found;
+};
+
+static int
+ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+{
+	struct ext4_xattr_entry *last;
+	size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
+
+	/* Compute min_offs and last. */
+	last = s->first;
+	for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+		if (!last->e_value_block && last->e_value_size) {
+			size_t offs = le16_to_cpu(last->e_value_offs);
+			if (offs < min_offs)
+				min_offs = offs;
+		}
+	}
+	free = min_offs - ((void *)last - s->base) - sizeof(__u32);
+	if (!s->not_found) {
+		if (!s->here->e_value_block && s->here->e_value_size) {
+			size_t size = le32_to_cpu(s->here->e_value_size);
+			free += EXT4_XATTR_SIZE(size);
+		}
+		free += EXT4_XATTR_LEN(name_len);
+	}
+	if (i->value) {
+		if (free < EXT4_XATTR_SIZE(i->value_len) ||
+		    free < EXT4_XATTR_LEN(name_len) +
+			   EXT4_XATTR_SIZE(i->value_len))
+			return -ENOSPC;
+	}
+
+	if (i->value && s->not_found) {
+		/* Insert the new name. */
+		size_t size = EXT4_XATTR_LEN(name_len);
+		size_t rest = (void *)last - (void *)s->here + sizeof(__u32);
+		memmove((void *)s->here + size, s->here, rest);
+		memset(s->here, 0, size);
+		s->here->e_name_index = i->name_index;
+		s->here->e_name_len = name_len;
+		memcpy(s->here->e_name, i->name, name_len);
+	} else {
+		if (!s->here->e_value_block && s->here->e_value_size) {
+			void *first_val = s->base + min_offs;
+			size_t offs = le16_to_cpu(s->here->e_value_offs);
+			void *val = s->base + offs;
+			size_t size = EXT4_XATTR_SIZE(
+				le32_to_cpu(s->here->e_value_size));
+
+			if (i->value && size == EXT4_XATTR_SIZE(i->value_len)) {
+				/* The old and the new value have the same
+				   size. Just replace. */
+				s->here->e_value_size =
+					cpu_to_le32(i->value_len);
+				memset(val + size - EXT4_XATTR_PAD, 0,
+				       EXT4_XATTR_PAD); /* Clear pad bytes. */
+				memcpy(val, i->value, i->value_len);
+				return 0;
+			}
+
+			/* Remove the old value. */
+			memmove(first_val + size, first_val, val - first_val);
+			memset(first_val, 0, size);
+			s->here->e_value_size = 0;
+			s->here->e_value_offs = 0;
+			min_offs += size;
+
+			/* Adjust all value offsets. */
+			last = s->first;
+			while (!IS_LAST_ENTRY(last)) {
+				size_t o = le16_to_cpu(last->e_value_offs);
+				if (!last->e_value_block &&
+				    last->e_value_size && o < offs)
+					last->e_value_offs =
+						cpu_to_le16(o + size);
+				last = EXT4_XATTR_NEXT(last);
+			}
+		}
+		if (!i->value) {
+			/* Remove the old name. */
+			size_t size = EXT4_XATTR_LEN(name_len);
+			last = ENTRY((void *)last - size);
+			memmove(s->here, (void *)s->here + size,
+				(void *)last - (void *)s->here + sizeof(__u32));
+			memset(last, 0, size);
+		}
+	}
+
+	if (i->value) {
+		/* Insert the new value. */
+		s->here->e_value_size = cpu_to_le32(i->value_len);
+		if (i->value_len) {
+			size_t size = EXT4_XATTR_SIZE(i->value_len);
+			void *val = s->base + min_offs - size;
+			s->here->e_value_offs = cpu_to_le16(min_offs - size);
+			memset(val + size - EXT4_XATTR_PAD, 0,
+			       EXT4_XATTR_PAD); /* Clear the pad bytes. */
+			memcpy(val, i->value, i->value_len);
+		}
+	}
+	return 0;
+}
+
+struct ext4_xattr_block_find {
+	struct ext4_xattr_search s;
+	struct buffer_head *bh;
+};
+
+static int
+ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
+		      struct ext4_xattr_block_find *bs)
+{
+	struct super_block *sb = inode->i_sb;
+	int error;
+
+	ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
+		  i->name_index, i->name, i->value, (long)i->value_len);
+
+	if (EXT4_I(inode)->i_file_acl) {
+		/* The inode already has an extended attribute block. */
+		bs->bh = sb_bread(sb, EXT4_I(inode)->i_file_acl);
+		error = -EIO;
+		if (!bs->bh)
+			goto cleanup;
+		ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
+			atomic_read(&(bs->bh->b_count)),
+			le32_to_cpu(BHDR(bs->bh)->h_refcount));
+		if (ext4_xattr_check_block(bs->bh)) {
+			ext4_error(sb, __FUNCTION__,
+				"inode %lu: bad block %llu", inode->i_ino,
+				EXT4_I(inode)->i_file_acl);
+			error = -EIO;
+			goto cleanup;
+		}
+		/* Find the named attribute. */
+		bs->s.base = BHDR(bs->bh);
+		bs->s.first = BFIRST(bs->bh);
+		bs->s.end = bs->bh->b_data + bs->bh->b_size;
+		bs->s.here = bs->s.first;
+		error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
+					      i->name, bs->bh->b_size, 1);
+		if (error && error != -ENODATA)
+			goto cleanup;
+		bs->s.not_found = error;
+	}
+	error = 0;
+
+cleanup:
+	return error;
+}
+
+static int
+ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+		     struct ext4_xattr_info *i,
+		     struct ext4_xattr_block_find *bs)
+{
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *new_bh = NULL;
+	struct ext4_xattr_search *s = &bs->s;
+	struct mb_cache_entry *ce = NULL;
+	int error;
+
+#define header(x) ((struct ext4_xattr_header *)(x))
+
+	if (i->value && i->value_len > sb->s_blocksize)
+		return -ENOSPC;
+	if (s->base) {
+		ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+					bs->bh->b_blocknr);
+		if (header(s->base)->h_refcount == cpu_to_le32(1)) {
+			if (ce) {
+				mb_cache_entry_free(ce);
+				ce = NULL;
+			}
+			ea_bdebug(bs->bh, "modifying in-place");
+			error = ext4_journal_get_write_access(handle, bs->bh);
+			if (error)
+				goto cleanup;
+			lock_buffer(bs->bh);
+			error = ext4_xattr_set_entry(i, s);
+			if (!error) {
+				if (!IS_LAST_ENTRY(s->first))
+					ext4_xattr_rehash(header(s->base),
+							  s->here);
+				ext4_xattr_cache_insert(bs->bh);
+			}
+			unlock_buffer(bs->bh);
+			if (error == -EIO)
+				goto bad_block;
+			if (!error)
+				error = ext4_journal_dirty_metadata(handle,
+								    bs->bh);
+			if (error)
+				goto cleanup;
+			goto inserted;
+		} else {
+			int offset = (char *)s->here - bs->bh->b_data;
+
+			if (ce) {
+				mb_cache_entry_release(ce);
+				ce = NULL;
+			}
+			ea_bdebug(bs->bh, "cloning");
+			s->base = kmalloc(bs->bh->b_size, GFP_KERNEL);
+			error = -ENOMEM;
+			if (s->base == NULL)
+				goto cleanup;
+			memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
+			s->first = ENTRY(header(s->base)+1);
+			header(s->base)->h_refcount = cpu_to_le32(1);
+			s->here = ENTRY(s->base + offset);
+			s->end = s->base + bs->bh->b_size;
+		}
+	} else {
+		/* Allocate a buffer where we construct the new block. */
+		s->base = kmalloc(sb->s_blocksize, GFP_KERNEL);
+		/* assert(header == s->base) */
+		error = -ENOMEM;
+		if (s->base == NULL)
+			goto cleanup;
+		memset(s->base, 0, sb->s_blocksize);
+		header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+		header(s->base)->h_blocks = cpu_to_le32(1);
+		header(s->base)->h_refcount = cpu_to_le32(1);
+		s->first = ENTRY(header(s->base)+1);
+		s->here = ENTRY(header(s->base)+1);
+		s->end = s->base + sb->s_blocksize;
+	}
+
+	error = ext4_xattr_set_entry(i, s);
+	if (error == -EIO)
+		goto bad_block;
+	if (error)
+		goto cleanup;
+	if (!IS_LAST_ENTRY(s->first))
+		ext4_xattr_rehash(header(s->base), s->here);
+
+inserted:
+	if (!IS_LAST_ENTRY(s->first)) {
+		new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
+		if (new_bh) {
+			/* We found an identical block in the cache. */
+			if (new_bh == bs->bh)
+				ea_bdebug(new_bh, "keeping");
+			else {
+				/* The old block is released after updating
+				   the inode. */
+				error = -EDQUOT;
+				if (DQUOT_ALLOC_BLOCK(inode, 1))
+					goto cleanup;
+				error = ext4_journal_get_write_access(handle,
+								      new_bh);
+				if (error)
+					goto cleanup_dquot;
+				lock_buffer(new_bh);
+				BHDR(new_bh)->h_refcount = cpu_to_le32(1 +
+					le32_to_cpu(BHDR(new_bh)->h_refcount));
+				ea_bdebug(new_bh, "reusing; refcount now=%d",
+					le32_to_cpu(BHDR(new_bh)->h_refcount));
+				unlock_buffer(new_bh);
+				error = ext4_journal_dirty_metadata(handle,
+								    new_bh);
+				if (error)
+					goto cleanup_dquot;
+			}
+			mb_cache_entry_release(ce);
+			ce = NULL;
+		} else if (bs->bh && s->base == bs->bh->b_data) {
+			/* We were modifying this block in-place. */
+			ea_bdebug(bs->bh, "keeping this block");
+			new_bh = bs->bh;
+			get_bh(new_bh);
+		} else {
+			/* We need to allocate a new block */
+			ext4_fsblk_t goal = le32_to_cpu(
+					EXT4_SB(sb)->s_es->s_first_data_block) +
+				(ext4_fsblk_t)EXT4_I(inode)->i_block_group *
+				EXT4_BLOCKS_PER_GROUP(sb);
+			ext4_fsblk_t block = ext4_new_block(handle, inode,
+							goal, &error);
+			if (error)
+				goto cleanup;
+			ea_idebug(inode, "creating block %d", block);
+
+			new_bh = sb_getblk(sb, block);
+			if (!new_bh) {
+getblk_failed:
+				ext4_free_blocks(handle, inode, block, 1);
+				error = -EIO;
+				goto cleanup;
+			}
+			lock_buffer(new_bh);
+			error = ext4_journal_get_create_access(handle, new_bh);
+			if (error) {
+				unlock_buffer(new_bh);
+				goto getblk_failed;
+			}
+			memcpy(new_bh->b_data, s->base, new_bh->b_size);
+			set_buffer_uptodate(new_bh);
+			unlock_buffer(new_bh);
+			ext4_xattr_cache_insert(new_bh);
+			error = ext4_journal_dirty_metadata(handle, new_bh);
+			if (error)
+				goto cleanup;
+		}
+	}
+
+	/* Update the inode. */
+	EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
+
+	/* Drop the previous xattr block. */
+	if (bs->bh && bs->bh != new_bh)
+		ext4_xattr_release_block(handle, inode, bs->bh);
+	error = 0;
+
+cleanup:
+	if (ce)
+		mb_cache_entry_release(ce);
+	brelse(new_bh);
+	if (!(bs->bh && s->base == bs->bh->b_data))
+		kfree(s->base);
+
+	return error;
+
+cleanup_dquot:
+	DQUOT_FREE_BLOCK(inode, 1);
+	goto cleanup;
+
+bad_block:
+	ext4_error(inode->i_sb, __FUNCTION__,
+		   "inode %lu: bad block %llu", inode->i_ino,
+		   EXT4_I(inode)->i_file_acl);
+	goto cleanup;
+
+#undef header
+}
+
+struct ext4_xattr_ibody_find {
+	struct ext4_xattr_search s;
+	struct ext4_iloc iloc;
+};
+
+static int
+ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+		      struct ext4_xattr_ibody_find *is)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_inode *raw_inode;
+	int error;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return 0;
+	raw_inode = ext4_raw_inode(&is->iloc);
+	header = IHDR(inode, raw_inode);
+	is->s.base = is->s.first = IFIRST(header);
+	is->s.here = is->s.first;
+	is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+	if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+		error = ext4_xattr_check_names(IFIRST(header), is->s.end);
+		if (error)
+			return error;
+		/* Find the named attribute. */
+		error = ext4_xattr_find_entry(&is->s.here, i->name_index,
+					      i->name, is->s.end -
+					      (void *)is->s.base, 0);
+		if (error && error != -ENODATA)
+			return error;
+		is->s.not_found = error;
+	}
+	return 0;
+}
+
+static int
+ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+		     struct ext4_xattr_info *i,
+		     struct ext4_xattr_ibody_find *is)
+{
+	struct ext4_xattr_ibody_header *header;
+	struct ext4_xattr_search *s = &is->s;
+	int error;
+
+	if (EXT4_I(inode)->i_extra_isize == 0)
+		return -ENOSPC;
+	error = ext4_xattr_set_entry(i, s);
+	if (error)
+		return error;
+	header = IHDR(inode, ext4_raw_inode(&is->iloc));
+	if (!IS_LAST_ENTRY(s->first)) {
+		header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+		EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
+	} else {
+		header->h_magic = cpu_to_le32(0);
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
+	}
+	return 0;
+}
+
+/*
+ * ext4_xattr_set_handle()
+ *
+ * Create, replace or remove an extended attribute for this inode. Buffer
+ * is NULL to remove an existing extended attribute, and non-NULL to
+ * either replace an existing extended attribute, or create a new extended
+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
+ * specify that an extended attribute must exist and must not exist
+ * previous to the call, respectively.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int
+ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+		      const char *name, const void *value, size_t value_len,
+		      int flags)
+{
+	struct ext4_xattr_info i = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+
+	};
+	struct ext4_xattr_ibody_find is = {
+		.s = { .not_found = -ENODATA, },
+	};
+	struct ext4_xattr_block_find bs = {
+		.s = { .not_found = -ENODATA, },
+	};
+	int error;
+
+	if (!name)
+		return -EINVAL;
+	if (strlen(name) > 255)
+		return -ERANGE;
+	down_write(&EXT4_I(inode)->xattr_sem);
+	error = ext4_get_inode_loc(inode, &is.iloc);
+	if (error)
+		goto cleanup;
+
+	if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+		struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
+		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
+	}
+
+	error = ext4_xattr_ibody_find(inode, &i, &is);
+	if (error)
+		goto cleanup;
+	if (is.s.not_found)
+		error = ext4_xattr_block_find(inode, &i, &bs);
+	if (error)
+		goto cleanup;
+	if (is.s.not_found && bs.s.not_found) {
+		error = -ENODATA;
+		if (flags & XATTR_REPLACE)
+			goto cleanup;
+		error = 0;
+		if (!value)
+			goto cleanup;
+	} else {
+		error = -EEXIST;
+		if (flags & XATTR_CREATE)
+			goto cleanup;
+	}
+	error = ext4_journal_get_write_access(handle, is.iloc.bh);
+	if (error)
+		goto cleanup;
+	if (!value) {
+		if (!is.s.not_found)
+			error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+		else if (!bs.s.not_found)
+			error = ext4_xattr_block_set(handle, inode, &i, &bs);
+	} else {
+		error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+		if (!error && !bs.s.not_found) {
+			i.value = NULL;
+			error = ext4_xattr_block_set(handle, inode, &i, &bs);
+		} else if (error == -ENOSPC) {
+			error = ext4_xattr_block_set(handle, inode, &i, &bs);
+			if (error)
+				goto cleanup;
+			if (!is.s.not_found) {
+				i.value = NULL;
+				error = ext4_xattr_ibody_set(handle, inode, &i,
+							     &is);
+			}
+		}
+	}
+	if (!error) {
+		ext4_xattr_update_super_block(handle, inode->i_sb);
+		inode->i_ctime = CURRENT_TIME_SEC;
+		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+		/*
+		 * The bh is consumed by ext4_mark_iloc_dirty, even with
+		 * error != 0.
+		 */
+		is.iloc.bh = NULL;
+		if (IS_SYNC(inode))
+			handle->h_sync = 1;
+	}
+
+cleanup:
+	brelse(is.iloc.bh);
+	brelse(bs.bh);
+	up_write(&EXT4_I(inode)->xattr_sem);
+	return error;
+}
+
+/*
+ * ext4_xattr_set()
+ *
+ * Like ext4_xattr_set_handle, but start from an inode. This extended
+ * attribute modification is a filesystem transaction by itself.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+int
+ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+	       const void *value, size_t value_len, int flags)
+{
+	handle_t *handle;
+	int error, retries = 0;
+
+retry:
+	handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle)) {
+		error = PTR_ERR(handle);
+	} else {
+		int error2;
+
+		error = ext4_xattr_set_handle(handle, inode, name_index, name,
+					      value, value_len, flags);
+		error2 = ext4_journal_stop(handle);
+		if (error == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+		if (error == 0)
+			error = error2;
+	}
+
+	return error;
+}
+
+/*
+ * ext4_xattr_delete_inode()
+ *
+ * Free extended attribute resources associated with this inode. This
+ * is called immediately before an inode is freed. We have exclusive
+ * access to the inode.
+ */
+void
+ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+{
+	struct buffer_head *bh = NULL;
+
+	if (!EXT4_I(inode)->i_file_acl)
+		goto cleanup;
+	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+	if (!bh) {
+		ext4_error(inode->i_sb, __FUNCTION__,
+			"inode %lu: block %llu read error", inode->i_ino,
+			EXT4_I(inode)->i_file_acl);
+		goto cleanup;
+	}
+	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
+	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
+		ext4_error(inode->i_sb, __FUNCTION__,
+			"inode %lu: bad block %llu", inode->i_ino,
+			EXT4_I(inode)->i_file_acl);
+		goto cleanup;
+	}
+	ext4_xattr_release_block(handle, inode, bh);
+	EXT4_I(inode)->i_file_acl = 0;
+
+cleanup:
+	brelse(bh);
+}
+
+/*
+ * ext4_xattr_put_super()
+ *
+ * This is called when a file system is unmounted.
+ */
+void
+ext4_xattr_put_super(struct super_block *sb)
+{
+	mb_cache_shrink(sb->s_bdev);
+}
+
+/*
+ * ext4_xattr_cache_insert()
+ *
+ * Create a new entry in the extended attribute cache, and insert
+ * it unless such an entry is already in the cache.
+ *
+ * Returns 0, or a negative error number on failure.
+ */
+static void
+ext4_xattr_cache_insert(struct buffer_head *bh)
+{
+	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+	struct mb_cache_entry *ce;
+	int error;
+
+	ce = mb_cache_entry_alloc(ext4_xattr_cache);
+	if (!ce) {
+		ea_bdebug(bh, "out of memory");
+		return;
+	}
+	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash);
+	if (error) {
+		mb_cache_entry_free(ce);
+		if (error == -EBUSY) {
+			ea_bdebug(bh, "already in cache");
+			error = 0;
+		}
+	} else {
+		ea_bdebug(bh, "inserting [%x]", (int)hash);
+		mb_cache_entry_release(ce);
+	}
+}
+
+/*
+ * ext4_xattr_cmp()
+ *
+ * Compare two extended attribute blocks for equality.
+ *
+ * Returns 0 if the blocks are equal, 1 if they differ, and
+ * a negative error number on errors.
+ */
+static int
+ext4_xattr_cmp(struct ext4_xattr_header *header1,
+	       struct ext4_xattr_header *header2)
+{
+	struct ext4_xattr_entry *entry1, *entry2;
+
+	entry1 = ENTRY(header1+1);
+	entry2 = ENTRY(header2+1);
+	while (!IS_LAST_ENTRY(entry1)) {
+		if (IS_LAST_ENTRY(entry2))
+			return 1;
+		if (entry1->e_hash != entry2->e_hash ||
+		    entry1->e_name_index != entry2->e_name_index ||
+		    entry1->e_name_len != entry2->e_name_len ||
+		    entry1->e_value_size != entry2->e_value_size ||
+		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
+			return 1;
+		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
+			return -EIO;
+		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
+			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
+			   le32_to_cpu(entry1->e_value_size)))
+			return 1;
+
+		entry1 = EXT4_XATTR_NEXT(entry1);
+		entry2 = EXT4_XATTR_NEXT(entry2);
+	}
+	if (!IS_LAST_ENTRY(entry2))
+		return 1;
+	return 0;
+}
+
+/*
+ * ext4_xattr_cache_find()
+ *
+ * Find an identical extended attribute block.
+ *
+ * Returns a pointer to the block found, or NULL if such a block was
+ * not found or an error occurred.
+ */
+static struct buffer_head *
+ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
+		      struct mb_cache_entry **pce)
+{
+	__u32 hash = le32_to_cpu(header->h_hash);
+	struct mb_cache_entry *ce;
+
+	if (!header->h_hash)
+		return NULL;  /* never share */
+	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
+again:
+	ce = mb_cache_entry_find_first(ext4_xattr_cache, 0,
+				       inode->i_sb->s_bdev, hash);
+	while (ce) {
+		struct buffer_head *bh;
+
+		if (IS_ERR(ce)) {
+			if (PTR_ERR(ce) == -EAGAIN)
+				goto again;
+			break;
+		}
+		bh = sb_bread(inode->i_sb, ce->e_block);
+		if (!bh) {
+			ext4_error(inode->i_sb, __FUNCTION__,
+				"inode %lu: block %lu read error",
+				inode->i_ino, (unsigned long) ce->e_block);
+		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
+				EXT4_XATTR_REFCOUNT_MAX) {
+			ea_idebug(inode, "block %lu refcount %d>=%d",
+				  (unsigned long) ce->e_block,
+				  le32_to_cpu(BHDR(bh)->h_refcount),
+					  EXT4_XATTR_REFCOUNT_MAX);
+		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
+			*pce = ce;
+			return bh;
+		}
+		brelse(bh);
+		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
+	}
+	return NULL;
+}
+
+#define NAME_HASH_SHIFT 5
+#define VALUE_HASH_SHIFT 16
+
+/*
+ * ext4_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
+					 struct ext4_xattr_entry *entry)
+{
+	__u32 hash = 0;
+	char *name = entry->e_name;
+	int n;
+
+	for (n=0; n < entry->e_name_len; n++) {
+		hash = (hash << NAME_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
+		       *name++;
+	}
+
+	if (entry->e_value_block == 0 && entry->e_value_size != 0) {
+		__le32 *value = (__le32 *)((char *)header +
+			le16_to_cpu(entry->e_value_offs));
+		for (n = (le32_to_cpu(entry->e_value_size) +
+		     EXT4_XATTR_ROUND) >> EXT4_XATTR_PAD_BITS; n; n--) {
+			hash = (hash << VALUE_HASH_SHIFT) ^
+			       (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
+			       le32_to_cpu(*value++);
+		}
+	}
+	entry->e_hash = cpu_to_le32(hash);
+}
+
+#undef NAME_HASH_SHIFT
+#undef VALUE_HASH_SHIFT
+
+#define BLOCK_HASH_SHIFT 16
+
+/*
+ * ext4_xattr_rehash()
+ *
+ * Re-compute the extended attribute hash value after an entry has changed.
+ */
+static void ext4_xattr_rehash(struct ext4_xattr_header *header,
+			      struct ext4_xattr_entry *entry)
+{
+	struct ext4_xattr_entry *here;
+	__u32 hash = 0;
+
+	ext4_xattr_hash_entry(header, entry);
+	here = ENTRY(header+1);
+	while (!IS_LAST_ENTRY(here)) {
+		if (!here->e_hash) {
+			/* Block is not shared if an entry's hash value == 0 */
+			hash = 0;
+			break;
+		}
+		hash = (hash << BLOCK_HASH_SHIFT) ^
+		       (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
+		       le32_to_cpu(here->e_hash);
+		here = EXT4_XATTR_NEXT(here);
+	}
+	header->h_hash = cpu_to_le32(hash);
+}
+
+#undef BLOCK_HASH_SHIFT
+
+int __init
+init_ext4_xattr(void)
+{
+	ext4_xattr_cache = mb_cache_create("ext4_xattr", NULL,
+		sizeof(struct mb_cache_entry) +
+		sizeof(((struct mb_cache_entry *) 0)->e_indexes[0]), 1, 6);
+	if (!ext4_xattr_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+exit_ext4_xattr(void)
+{
+	if (ext4_xattr_cache)
+		mb_cache_destroy(ext4_xattr_cache);
+	ext4_xattr_cache = NULL;
+}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
new file mode 100644
index 000000000000..79432b35398f
--- /dev/null
+++ b/fs/ext4/xattr.h
@@ -0,0 +1,145 @@
+/*
+  File: fs/ext4/xattr.h
+
+  On-disk format of extended attributes for the ext4 filesystem.
+
+  (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+*/
+
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define EXT4_XATTR_MAGIC		0xEA020000
+
+/* Maximum number of references to one attribute block */
+#define EXT4_XATTR_REFCOUNT_MAX		1024
+
+/* Name indexes */
+#define EXT4_XATTR_INDEX_USER			1
+#define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS	2
+#define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT	3
+#define EXT4_XATTR_INDEX_TRUSTED		4
+#define	EXT4_XATTR_INDEX_LUSTRE			5
+#define EXT4_XATTR_INDEX_SECURITY	        6
+
+struct ext4_xattr_header {
+	__le32	h_magic;	/* magic number for identification */
+	__le32	h_refcount;	/* reference count */
+	__le32	h_blocks;	/* number of disk blocks used */
+	__le32	h_hash;		/* hash value of all attributes */
+	__u32	h_reserved[4];	/* zero right now */
+};
+
+struct ext4_xattr_ibody_header {
+	__le32	h_magic;	/* magic number for identification */
+};
+
+struct ext4_xattr_entry {
+	__u8	e_name_len;	/* length of name */
+	__u8	e_name_index;	/* attribute name index */
+	__le16	e_value_offs;	/* offset in disk block of value */
+	__le32	e_value_block;	/* disk block attribute is stored on (n/i) */
+	__le32	e_value_size;	/* size of attribute value */
+	__le32	e_hash;		/* hash value of name and value */
+	char	e_name[0];	/* attribute name */
+};
+
+#define EXT4_XATTR_PAD_BITS		2
+#define EXT4_XATTR_PAD		(1<<EXT4_XATTR_PAD_BITS)
+#define EXT4_XATTR_ROUND		(EXT4_XATTR_PAD-1)
+#define EXT4_XATTR_LEN(name_len) \
+	(((name_len) + EXT4_XATTR_ROUND + \
+	sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
+#define EXT4_XATTR_NEXT(entry) \
+	( (struct ext4_xattr_entry *)( \
+	  (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
+#define EXT4_XATTR_SIZE(size) \
+	(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
+
+# ifdef CONFIG_EXT4DEV_FS_XATTR
+
+extern struct xattr_handler ext4_xattr_user_handler;
+extern struct xattr_handler ext4_xattr_trusted_handler;
+extern struct xattr_handler ext4_xattr_acl_access_handler;
+extern struct xattr_handler ext4_xattr_acl_default_handler;
+extern struct xattr_handler ext4_xattr_security_handler;
+
+extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
+
+extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ext4_xattr_list(struct inode *, char *, size_t);
+extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
+extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
+
+extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
+extern void ext4_xattr_put_super(struct super_block *);
+
+extern int init_ext4_xattr(void);
+extern void exit_ext4_xattr(void);
+
+extern struct xattr_handler *ext4_xattr_handlers[];
+
+# else  /* CONFIG_EXT4DEV_FS_XATTR */
+
+static inline int
+ext4_xattr_get(struct inode *inode, int name_index, const char *name,
+	       void *buffer, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int
+ext4_xattr_list(struct inode *inode, void *buffer, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int
+ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+	       const void *value, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int
+ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+	       const char *name, const void *value, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void
+ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+{
+}
+
+static inline void
+ext4_xattr_put_super(struct super_block *sb)
+{
+}
+
+static inline int
+init_ext4_xattr(void)
+{
+	return 0;
+}
+
+static inline void
+exit_ext4_xattr(void)
+{
+}
+
+#define ext4_xattr_handlers	NULL
+
+# endif  /* CONFIG_EXT4DEV_FS_XATTR */
+
+#ifdef CONFIG_EXT4DEV_FS_SECURITY
+extern int ext4_init_security(handle_t *handle, struct inode *inode,
+				struct inode *dir);
+#else
+static inline int ext4_init_security(handle_t *handle, struct inode *inode,
+				struct inode *dir)
+{
+	return 0;
+}
+#endif
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
new file mode 100644
index 000000000000..b6a6861951f9
--- /dev/null
+++ b/fs/ext4/xattr_security.c
@@ -0,0 +1,77 @@
+/*
+ * linux/fs/ext4/xattr_security.c
+ * Handler for storing security labels as extended attributes.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include <linux/security.h>
+#include "xattr.h"
+
+static size_t
+ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
+			 const char *name, size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+	const size_t total_len = prefix_len + name_len + 1;
+
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext4_xattr_security_get(struct inode *inode, const char *name,
+		       void *buffer, size_t size)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
+			      buffer, size);
+}
+
+static int
+ext4_xattr_security_set(struct inode *inode, const char *name,
+		       const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
+			      value, size, flags);
+}
+
+int
+ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
+{
+	int err;
+	size_t len;
+	void *value;
+	char *name;
+
+	err = security_inode_init_security(inode, dir, &name, &value, &len);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+	err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY,
+				    name, value, len, 0);
+	kfree(name);
+	kfree(value);
+	return err;
+}
+
+struct xattr_handler ext4_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= ext4_xattr_security_list,
+	.get	= ext4_xattr_security_get,
+	.set	= ext4_xattr_security_set,
+};
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
new file mode 100644
index 000000000000..b76f2dbc82da
--- /dev/null
+++ b/fs/ext4/xattr_trusted.c
@@ -0,0 +1,62 @@
+/*
+ * linux/fs/ext4/xattr_trusted.c
+ * Handler for trusted extended attributes.
+ *
+ * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include "xattr.h"
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+
+static size_t
+ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
+			const char *name, size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext4_xattr_trusted_get(struct inode *inode, const char *name,
+		       void *buffer, size_t size)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
+			      buffer, size);
+}
+
+static int
+ext4_xattr_trusted_set(struct inode *inode, const char *name,
+		       const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
+			      value, size, flags);
+}
+
+struct xattr_handler ext4_xattr_trusted_handler = {
+	.prefix	= XATTR_TRUSTED_PREFIX,
+	.list	= ext4_xattr_trusted_list,
+	.get	= ext4_xattr_trusted_get,
+	.set	= ext4_xattr_trusted_set,
+};
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
new file mode 100644
index 000000000000..c53cded0761a
--- /dev/null
+++ b/fs/ext4/xattr_user.c
@@ -0,0 +1,64 @@
+/*
+ * linux/fs/ext4/xattr_user.c
+ * Handler for extended user attributes.
+ *
+ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/ext4_jbd2.h>
+#include <linux/ext4_fs.h>
+#include "xattr.h"
+
+#define XATTR_USER_PREFIX "user."
+
+static size_t
+ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
+		     const char *name, size_t name_len)
+{
+	const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (!test_opt(inode->i_sb, XATTR_USER))
+		return 0;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_USER_PREFIX, prefix_len);
+		memcpy(list+prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int
+ext4_xattr_user_get(struct inode *inode, const char *name,
+		    void *buffer, size_t size)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (!test_opt(inode->i_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+	return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
+}
+
+static int
+ext4_xattr_user_set(struct inode *inode, const char *name,
+		    const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	if (!test_opt(inode->i_sb, XATTR_USER))
+		return -EOPNOTSUPP;
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
+			      value, size, flags);
+}
+
+struct xattr_handler ext4_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= ext4_xattr_user_list,
+	.get	= ext4_xattr_user_get,
+	.set	= ext4_xattr_user_set,
+};
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 97b967b84fc6..82cc4f59e3ba 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -58,8 +58,7 @@ int __init fat_cache_init(void)
 
 void fat_cache_destroy(void)
 {
-	if (kmem_cache_destroy(fat_cache_cachep))
-		printk(KERN_INFO "fat_cache: not all structures were freed\n");
+	kmem_cache_destroy(fat_cache_cachep);
 }
 
 static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 698b85bb1dd4..69c439f44387 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -20,6 +20,7 @@
 #include <linux/dirent.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/compat.h>
 #include <asm/uaccess.h>
 
 static inline loff_t fat_make_i_pos(struct super_block *sb,
@@ -647,7 +648,7 @@ static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir)
 }
 
 static int fat_ioctl_filldir(void *__buf, const char *name, int name_len,
-			     loff_t offset, ino_t ino, unsigned int d_type)
+			     loff_t offset, u64 ino, unsigned int d_type)
 {
 	struct fat_ioctl_filldir_callback *buf = __buf;
 	struct dirent __user *d1 = buf->dirent;
@@ -741,10 +742,65 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp,
 	return ret;
 }
 
+#ifdef CONFIG_COMPAT
+#define	VFAT_IOCTL_READDIR_BOTH32	_IOR('r', 1, struct compat_dirent[2])
+#define	VFAT_IOCTL_READDIR_SHORT32	_IOR('r', 2, struct compat_dirent[2])
+
+static long fat_compat_put_dirent32(struct dirent *d,
+				    struct compat_dirent __user *d32)
+{
+        if (!access_ok(VERIFY_WRITE, d32, sizeof(struct compat_dirent)))
+                return -EFAULT;
+
+        __put_user(d->d_ino, &d32->d_ino);
+        __put_user(d->d_off, &d32->d_off);
+        __put_user(d->d_reclen, &d32->d_reclen);
+        if (__copy_to_user(d32->d_name, d->d_name, d->d_reclen))
+		return -EFAULT;
+
+        return 0;
+}
+
+static long fat_compat_dir_ioctl(struct file *file, unsigned cmd,
+				 unsigned long arg)
+{
+	struct compat_dirent __user *p = compat_ptr(arg);
+	int ret;
+	mm_segment_t oldfs = get_fs();
+	struct dirent d[2];
+
+	switch (cmd) {
+	case VFAT_IOCTL_READDIR_BOTH32:
+		cmd = VFAT_IOCTL_READDIR_BOTH;
+		break;
+	case VFAT_IOCTL_READDIR_SHORT32:
+		cmd = VFAT_IOCTL_READDIR_SHORT;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+
+	set_fs(KERNEL_DS);
+	lock_kernel();
+	ret = fat_dir_ioctl(file->f_dentry->d_inode, file,
+			    cmd, (unsigned long) &d);
+	unlock_kernel();
+	set_fs(oldfs);
+	if (ret >= 0) {
+		ret |= fat_compat_put_dirent32(&d[0], p);
+		ret |= fat_compat_put_dirent32(&d[1], p + 1);
+	}
+	return ret;
+}
+#endif /* CONFIG_COMPAT */
+
 const struct file_operations fat_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= fat_readdir,
 	.ioctl		= fat_dir_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= fat_compat_dir_ioctl,
+#endif
 	.fsync		= file_fsync,
 };
 
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 1ee25232e6af..8337451e7897 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -13,6 +13,8 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
 		      unsigned int cmd, unsigned long arg)
@@ -112,15 +114,24 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
 	}
 }
 
+static int fat_file_release(struct inode *inode, struct file *filp)
+{
+	if ((filp->f_mode & FMODE_WRITE) &&
+	     MSDOS_SB(inode->i_sb)->options.flush) {
+		fat_flush_inodes(inode->i_sb, inode, NULL);
+		congestion_wait(WRITE, HZ/10);
+	}
+	return 0;
+}
+
 const struct file_operations fat_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
+	.release	= fat_file_release,
 	.ioctl		= fat_generic_ioctl,
 	.fsync		= file_fsync,
 	.sendfile	= generic_file_sendfile,
@@ -289,6 +300,7 @@ void fat_truncate(struct inode *inode)
 	lock_kernel();
 	fat_free(inode, nr_clusters);
 	unlock_kernel();
+	fat_flush_inodes(inode->i_sb, inode, NULL);
 }
 
 struct inode_operations fat_file_inode_operations = {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 31b7174176ba..78945b53b0f8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -24,6 +24,7 @@
 #include <linux/vfs.h>
 #include <linux/parser.h>
 #include <linux/uio.h>
+#include <linux/writeback.h>
 #include <asm/unaligned.h>
 
 #ifndef CONFIG_FAT_DEFAULT_IOCHARSET
@@ -50,14 +51,14 @@ static int fat_add_cluster(struct inode *inode)
 	return err;
 }
 
-static int __fat_get_blocks(struct inode *inode, sector_t iblock,
-			    unsigned long *max_blocks,
-			    struct buffer_head *bh_result, int create)
+static inline int __fat_get_block(struct inode *inode, sector_t iblock,
+				  unsigned long *max_blocks,
+				  struct buffer_head *bh_result, int create)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	sector_t phys;
 	unsigned long mapped_blocks;
+	sector_t phys;
 	int err, offset;
 
 	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
@@ -73,7 +74,7 @@ static int __fat_get_blocks(struct inode *inode, sector_t iblock,
 
 	if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) {
 		fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)",
-			     MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
+			MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private);
 		return -EIO;
 	}
 
@@ -93,34 +94,29 @@ static int __fat_get_blocks(struct inode *inode, sector_t iblock,
 	err = fat_bmap(inode, iblock, &phys, &mapped_blocks);
 	if (err)
 		return err;
+
 	BUG_ON(!phys);
 	BUG_ON(*max_blocks != mapped_blocks);
 	set_buffer_new(bh_result);
 	map_bh(bh_result, sb, phys);
+
 	return 0;
 }
 
-static int fat_get_blocks(struct inode *inode, sector_t iblock,
-			  struct buffer_head *bh_result, int create)
+static int fat_get_block(struct inode *inode, sector_t iblock,
+			 struct buffer_head *bh_result, int create)
 {
 	struct super_block *sb = inode->i_sb;
-	int err;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int err;
 
-	err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
+	err = __fat_get_block(inode, iblock, &max_blocks, bh_result, create);
 	if (err)
 		return err;
 	bh_result->b_size = max_blocks << sb->s_blocksize_bits;
 	return 0;
 }
 
-static int fat_get_block(struct inode *inode, sector_t iblock,
-			 struct buffer_head *bh_result, int create)
-{
-	unsigned long max_blocks = 1;
-	return __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create);
-}
-
 static int fat_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, fat_get_block, wbc);
@@ -188,7 +184,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 	 * condition of fat_get_block() and ->truncate().
 	 */
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs, fat_get_blocks, NULL);
+				  offset, nr_segs, fat_get_block, NULL);
 }
 
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -375,8 +371,6 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 			inode->i_flags |= S_IMMUTABLE;
 	}
 	MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED;
-	/* this is as close to the truth as we can get ... */
-	inode->i_blksize = sbi->cluster_size;
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 	inode->i_mtime.tv_sec =
@@ -390,7 +384,7 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 				      le16_to_cpu(de->cdate)) + secs;
 		inode->i_ctime.tv_nsec = csecs * 10000000;
 		inode->i_atime.tv_sec =
-			date_dos2unix(le16_to_cpu(0), le16_to_cpu(de->adate));
+			date_dos2unix(0, le16_to_cpu(de->adate));
 		inode->i_atime.tv_nsec = 0;
 	} else
 		inode->i_ctime = inode->i_atime = inode->i_mtime;
@@ -528,8 +522,7 @@ static int __init fat_init_inodecache(void)
 
 static void __exit fat_destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(fat_inode_cachep))
-		printk(KERN_INFO "fat_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(fat_inode_cachep);
 }
 
 static int fat_remount(struct super_block *sb, int *flags, char *data)
@@ -861,7 +854,7 @@ enum {
 	Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
 	Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
 	Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
-	Opt_obsolate, Opt_err,
+	Opt_obsolate, Opt_flush, Opt_err,
 };
 
 static match_table_t fat_tokens = {
@@ -893,7 +886,8 @@ static match_table_t fat_tokens = {
 	{Opt_obsolate, "cvf_format=%20s"},
 	{Opt_obsolate, "cvf_options=%100s"},
 	{Opt_obsolate, "posix"},
-	{Opt_err, NULL}
+	{Opt_flush, "flush"},
+	{Opt_err, NULL},
 };
 static match_table_t msdos_tokens = {
 	{Opt_nodots, "nodots"},
@@ -1034,6 +1028,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 				return 0;
 			opts->codepage = option;
 			break;
+		case Opt_flush:
+			opts->flush = 1;
+			break;
 
 		/* msdos specific */
 		case Opt_dots:
@@ -1137,7 +1134,6 @@ static int fat_read_root(struct inode *inode)
 		MSDOS_I(inode)->i_start = 0;
 		inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry);
 	}
-	inode->i_blksize = sbi->cluster_size;
 	inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
 			   & ~((loff_t)sbi->cluster_size - 1)) >> 9;
 	MSDOS_I(inode)->i_logstart = 0;
@@ -1168,11 +1164,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	long error;
 	char buf[50];
 
-	sbi = kmalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 	sb->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(struct msdos_sb_info));
 
 	sb->s_flags |= MS_NODIRATIME;
 	sb->s_magic = MSDOS_SUPER_MAGIC;
@@ -1435,6 +1430,56 @@ out_fail:
 
 EXPORT_SYMBOL_GPL(fat_fill_super);
 
+/*
+ * helper function for fat_flush_inodes.  This writes both the inode
+ * and the file data blocks, waiting for in flight data blocks before
+ * the start of the call.  It does not wait for any io started
+ * during the call
+ */
+static int writeback_inode(struct inode *inode)
+{
+
+	int ret;
+	struct address_space *mapping = inode->i_mapping;
+	struct writeback_control wbc = {
+	       .sync_mode = WB_SYNC_NONE,
+	      .nr_to_write = 0,
+	};
+	/* if we used WB_SYNC_ALL, sync_inode waits for the io for the
+	* inode to finish.  So WB_SYNC_NONE is sent down to sync_inode
+	* and filemap_fdatawrite is used for the data blocks
+	*/
+	ret = sync_inode(inode, &wbc);
+	if (!ret)
+	       ret = filemap_fdatawrite(mapping);
+	return ret;
+}
+
+/*
+ * write data and metadata corresponding to i1 and i2.  The io is
+ * started but we do not wait for any of it to finish.
+ *
+ * filemap_flush is used for the block device, so if there is a dirty
+ * page for a block already in flight, we will not wait and start the
+ * io over again
+ */
+int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2)
+{
+	int ret = 0;
+	if (!MSDOS_SB(sb)->options.flush)
+		return 0;
+	if (i1)
+		ret = writeback_inode(i1);
+	if (!ret && i2)
+		ret = writeback_inode(i2);
+	if (!ret) {
+		struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+		ret = filemap_flush(mapping);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(fat_flush_inodes);
+
 static int __init init_fat_fs(void)
 {
 	int err;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d35cbc6bc112..e4f26165f12a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -250,19 +250,22 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 	return error;
 }
 
-static void f_modown(struct file *filp, unsigned long pid,
+static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
                      uid_t uid, uid_t euid, int force)
 {
 	write_lock_irq(&filp->f_owner.lock);
 	if (force || !filp->f_owner.pid) {
-		filp->f_owner.pid = pid;
+		put_pid(filp->f_owner.pid);
+		filp->f_owner.pid = get_pid(pid);
+		filp->f_owner.pid_type = type;
 		filp->f_owner.uid = uid;
 		filp->f_owner.euid = euid;
 	}
 	write_unlock_irq(&filp->f_owner.lock);
 }
 
-int f_setown(struct file *filp, unsigned long arg, int force)
+int __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
+		int force)
 {
 	int err;
 	
@@ -270,15 +273,44 @@ int f_setown(struct file *filp, unsigned long arg, int force)
 	if (err)
 		return err;
 
-	f_modown(filp, arg, current->uid, current->euid, force);
+	f_modown(filp, pid, type, current->uid, current->euid, force);
 	return 0;
 }
+EXPORT_SYMBOL(__f_setown);
 
+int f_setown(struct file *filp, unsigned long arg, int force)
+{
+	enum pid_type type;
+	struct pid *pid;
+	int who = arg;
+	int result;
+	type = PIDTYPE_PID;
+	if (who < 0) {
+		type = PIDTYPE_PGID;
+		who = -who;
+	}
+	rcu_read_lock();
+	pid = find_pid(who);
+	result = __f_setown(filp, pid, type, force);
+	rcu_read_unlock();
+	return result;
+}
 EXPORT_SYMBOL(f_setown);
 
 void f_delown(struct file *filp)
 {
-	f_modown(filp, 0, 0, 0, 1);
+	f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1);
+}
+
+pid_t f_getown(struct file *filp)
+{
+	pid_t pid;
+	read_lock(&filp->f_owner.lock);
+	pid = pid_nr(filp->f_owner.pid);
+	if (filp->f_owner.pid_type == PIDTYPE_PGID)
+		pid = -pid;
+	read_unlock(&filp->f_owner.lock);
+	return pid;
 }
 
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
@@ -319,7 +351,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 		 * current syscall conventions, the only way
 		 * to fix this will be in libc.
 		 */
-		err = filp->f_owner.pid;
+		err = f_getown(filp);
 		force_successful_syscall_return();
 		break;
 	case F_SETOWN:
@@ -470,24 +502,19 @@ static void send_sigio_to_task(struct task_struct *p,
 void send_sigio(struct fown_struct *fown, int fd, int band)
 {
 	struct task_struct *p;
-	int pid;
+	enum pid_type type;
+	struct pid *pid;
 	
 	read_lock(&fown->lock);
+	type = fown->pid_type;
 	pid = fown->pid;
 	if (!pid)
 		goto out_unlock_fown;
 	
 	read_lock(&tasklist_lock);
-	if (pid > 0) {
-		p = find_task_by_pid(pid);
-		if (p) {
-			send_sigio_to_task(p, fown, fd, band);
-		}
-	} else {
-		do_each_task_pid(-pid, PIDTYPE_PGID, p) {
-			send_sigio_to_task(p, fown, fd, band);
-		} while_each_task_pid(-pid, PIDTYPE_PGID, p);
-	}
+	do_each_pid_task(pid, type, p) {
+		send_sigio_to_task(p, fown, fd, band);
+	} while_each_pid_task(pid, type, p);
 	read_unlock(&tasklist_lock);
  out_unlock_fown:
 	read_unlock(&fown->lock);
@@ -503,9 +530,12 @@ static void send_sigurg_to_task(struct task_struct *p,
 int send_sigurg(struct fown_struct *fown)
 {
 	struct task_struct *p;
-	int pid, ret = 0;
+	enum pid_type type;
+	struct pid *pid;
+	int ret = 0;
 	
 	read_lock(&fown->lock);
+	type = fown->pid_type;
 	pid = fown->pid;
 	if (!pid)
 		goto out_unlock_fown;
@@ -513,16 +543,9 @@ int send_sigurg(struct fown_struct *fown)
 	ret = 1;
 	
 	read_lock(&tasklist_lock);
-	if (pid > 0) {
-		p = find_task_by_pid(pid);
-		if (p) {
-			send_sigurg_to_task(p, fown);
-		}
-	} else {
-		do_each_task_pid(-pid, PIDTYPE_PGID, p) {
-			send_sigurg_to_task(p, fown);
-		} while_each_task_pid(-pid, PIDTYPE_PGID, p);
-	}
+	do_each_pid_task(pid, type, p) {
+		send_sigurg_to_task(p, fown);
+	} while_each_pid_task(pid, type, p);
 	read_unlock(&tasklist_lock);
  out_unlock_fown:
 	read_unlock(&fown->lock);
diff --git a/fs/file.c b/fs/file.c
index b3c6b82e6a9d..8e81775c5dc8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -281,80 +281,70 @@ static struct fdtable *alloc_fdtable(int nr)
 out2:
 	nfds = fdt->max_fdset;
 out:
-  	if (new_openset)
-  		free_fdset(new_openset, nfds);
-  	if (new_execset)
-  		free_fdset(new_execset, nfds);
+	free_fdset(new_openset, nfds);
+	free_fdset(new_execset, nfds);
 	kfree(fdt);
 	return NULL;
 }
 
 /*
- * Expands the file descriptor table - it will allocate a new fdtable and
- * both fd array and fdset. It is expected to be called with the
- * files_lock held.
+ * Expand the file descriptor table.
+ * This function will allocate a new fdtable and both fd array and fdset, of
+ * the given size.
+ * Return <0 error code on error; 1 on successful completion.
+ * The files->file_lock should be held on entry, and will be held on exit.
  */
 static int expand_fdtable(struct files_struct *files, int nr)
 	__releases(files->file_lock)
 	__acquires(files->file_lock)
 {
-	int error = 0;
-	struct fdtable *fdt;
-	struct fdtable *nfdt = NULL;
+	struct fdtable *new_fdt, *cur_fdt;
 
 	spin_unlock(&files->file_lock);
-	nfdt = alloc_fdtable(nr);
-	if (!nfdt) {
-		error = -ENOMEM;
-		spin_lock(&files->file_lock);
-		goto out;
-	}
-
+	new_fdt = alloc_fdtable(nr);
 	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
+	if (!new_fdt)
+		return -ENOMEM;
 	/*
-	 * Check again since another task may have expanded the
-	 * fd table while we dropped the lock
+	 * Check again since another task may have expanded the fd table while
+	 * we dropped the lock
 	 */
-	if (nr >= fdt->max_fds || nr >= fdt->max_fdset) {
-		copy_fdtable(nfdt, fdt);
+	cur_fdt = files_fdtable(files);
+	if (nr >= cur_fdt->max_fds || nr >= cur_fdt->max_fdset) {
+		/* Continue as planned */
+		copy_fdtable(new_fdt, cur_fdt);
+		rcu_assign_pointer(files->fdt, new_fdt);
+		free_fdtable(cur_fdt);
 	} else {
-		/* Somebody expanded while we dropped file_lock */
-		spin_unlock(&files->file_lock);
-		__free_fdtable(nfdt);
-		spin_lock(&files->file_lock);
-		goto out;
+		/* Somebody else expanded, so undo our attempt */
+		__free_fdtable(new_fdt);
 	}
-	rcu_assign_pointer(files->fdt, nfdt);
-	free_fdtable(fdt);
-out:
-	return error;
+	return 1;
 }
 
 /*
  * Expand files.
- * Return <0 on error; 0 nothing done; 1 files expanded, we may have blocked.
- * Should be called with the files->file_lock spinlock held for write.
+ * This function will expand the file structures, if the requested size exceeds
+ * the current capacity and there is room for expansion.
+ * Return <0 error code on error; 0 when nothing done; 1 when files were
+ * expanded and execution may have blocked.
+ * The files->file_lock should be held on entry, and will be held on exit.
  */
 int expand_files(struct files_struct *files, int nr)
 {
-	int err, expand = 0;
 	struct fdtable *fdt;
 
 	fdt = files_fdtable(files);
-	if (nr >= fdt->max_fdset || nr >= fdt->max_fds) {
-		if (fdt->max_fdset >= NR_OPEN ||
-			fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) {
-			err = -EMFILE;
-			goto out;
-		}
-		expand = 1;
-		if ((err = expand_fdtable(files, nr)))
-			goto out;
-	}
-	err = expand;
-out:
-	return err;
+	/* Do we need to expand? */
+	if (nr < fdt->max_fdset && nr < fdt->max_fds)
+		return 0;
+	/* Can we expand? */
+	if (fdt->max_fdset >= NR_OPEN || fdt->max_fds >= NR_OPEN ||
+	    nr >= NR_OPEN)
+		return -EMFILE;
+
+	/* All good, so we try */
+	return expand_fdtable(files, nr);
 }
 
 static void __devinit fdtable_defer_list_init(int cpu)
diff --git a/fs/file_table.c b/fs/file_table.c
index 0131ba06e1ee..24f25a057d9c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -169,11 +169,12 @@ void fastcall __fput(struct file *file)
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
-	if (unlikely(inode->i_cdev != NULL))
+	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
 	if (file->f_mode & FMODE_WRITE)
 		put_write_access(inode);
+	put_pid(file->f_owner.pid);
 	file_kill(file);
 	file->f_dentry = NULL;
 	file->f_vfsmnt = NULL;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 9f1072836c8e..e3fa77c6ed56 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -69,8 +69,6 @@ int register_filesystem(struct file_system_type * fs)
 	int res = 0;
 	struct file_system_type ** p;
 
-	if (!fs)
-		return -EINVAL;
 	if (fs->next)
 		return -EBUSY;
 	INIT_LIST_HEAD(&fs->fs_supers);
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index d35979a58743..c8a92652612a 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -252,7 +252,7 @@ enum {
  * Get filesystem private data from VFS inode.
  */
 #define VXFS_INO(ip) \
-	((struct vxfs_inode_info *)(ip)->u.generic_ip)
+	((struct vxfs_inode_info *)(ip)->i_private)
 
 /*
  * Get filesystem private data from VFS superblock.
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ca6a39714771..4786d51ad3bd 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -239,11 +239,10 @@ vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
 	ip->i_ctime.tv_nsec = 0;
 	ip->i_mtime.tv_nsec = 0;
 
-	ip->i_blksize = PAGE_SIZE;
 	ip->i_blocks = vip->vii_blocks;
 	ip->i_generation = vip->vii_gen;
 
-	ip->u.generic_ip = (void *)vip;
+	ip->i_private = vip;
 	
 }
 
@@ -338,5 +337,5 @@ vxfs_read_inode(struct inode *ip)
 void
 vxfs_clear_inode(struct inode *ip)
 {
-	kmem_cache_free(vxfs_inode_cachep, ip->u.generic_ip);
+	kmem_cache_free(vxfs_inode_cachep, ip->i_private);
 }
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index b74b791fc23b..ac28b0835ffc 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -260,12 +260,17 @@ static struct file_system_type vxfs_fs_type = {
 static int __init
 vxfs_init(void)
 {
+	int rv;
+
 	vxfs_inode_cachep = kmem_cache_create("vxfs_inode",
 			sizeof(struct vxfs_inode_info), 0, 
 			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL, NULL);
-	if (vxfs_inode_cachep)
-		return register_filesystem(&vxfs_fs_type);
-	return -ENOMEM;
+	if (!vxfs_inode_cachep)
+		return -ENOMEM;
+	rv = register_filesystem(&vxfs_fs_type);
+	if (rv < 0)
+		kmem_cache_destroy(vxfs_inode_cachep);
+	return rv;
 }
 
 static void __exit
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 892643dc9af1..c403b66ec83c 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -22,8 +22,7 @@
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
-
-extern struct super_block *blockdev_superblock;
+#include "internal.h"
 
 /**
  *	__mark_inode_dirty -	internal function
@@ -320,7 +319,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 
 		if (!bdi_cap_writeback_dirty(bdi)) {
 			list_move(&inode->i_list, &sb->s_dirty);
-			if (sb == blockdev_superblock) {
+			if (sb_is_blkdev_sb(sb)) {
 				/*
 				 * Dirty memory-backed blockdev: the ramdisk
 				 * driver does this.  Skip just this inode
@@ -337,14 +336,14 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 
 		if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			wbc->encountered_congestion = 1;
-			if (sb != blockdev_superblock)
+			if (!sb_is_blkdev_sb(sb))
 				break;		/* Skip a congested fs */
 			list_move(&inode->i_list, &sb->s_dirty);
 			continue;		/* Skip a congested blockdev */
 		}
 
 		if (wbc->bdi && bdi != wbc->bdi) {
-			if (sb != blockdev_superblock)
+			if (!sb_is_blkdev_sb(sb))
 				break;		/* fs has the wrong queue */
 			list_move(&inode->i_list, &sb->s_dirty);
 			continue;		/* blockdev has wrong queue */
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 46fe60b2da23..16b39c053d47 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -23,7 +23,7 @@ static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
 {
 	struct fuse_conn *fc;
 	mutex_lock(&fuse_mutex);
-	fc = file->f_dentry->d_inode->u.generic_ip;
+	fc = file->f_dentry->d_inode->i_private;
 	if (fc)
 		fc = fuse_conn_get(fc);
 	mutex_unlock(&fuse_mutex);
@@ -98,7 +98,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 		inode->i_op = iop;
 	inode->i_fop = fop;
 	inode->i_nlink = nlink;
-	inode->u.generic_ip = fc;
+	inode->i_private = fc;
 	d_add(dentry, inode);
 	return dentry;
 }
@@ -116,7 +116,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 		return 0;
 
 	parent = fuse_control_sb->s_root;
-	parent->d_inode->i_nlink++;
+	inc_nlink(parent->d_inode);
 	sprintf(name, "%llu", (unsigned long long) fc->id);
 	parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
 				     &simple_dir_inode_operations,
@@ -150,7 +150,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
 
 	for (i = fc->ctl_ndents - 1; i >= 0; i--) {
 		struct dentry *dentry = fc->ctl_dentry[i];
-		dentry->d_inode->u.generic_ip = NULL;
+		dentry->d_inode->i_private = NULL;
 		d_drop(dentry);
 		dput(dentry);
 	}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1e2006caf158..66571eafbb1e 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -212,6 +212,7 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
  * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+	__releases(fc->lock)
 {
 	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 	req->end = NULL;
@@ -640,6 +641,7 @@ static void request_wait(struct fuse_conn *fc)
  */
 static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
 			       const struct iovec *iov, unsigned long nr_segs)
+	__releases(fc->lock)
 {
 	struct fuse_copy_state cs;
 	struct fuse_in_header ih;
@@ -678,14 +680,15 @@ static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
  * request_end().  Otherwise add it to the processing list, and set
  * the 'sent' flag.
  */
-static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
-			      unsigned long nr_segs, loff_t *off)
+static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t pos)
 {
 	int err;
 	struct fuse_req *req;
 	struct fuse_in *in;
 	struct fuse_copy_state cs;
 	unsigned reqsize;
+	struct file *file = iocb->ki_filp;
 	struct fuse_conn *fc = fuse_get_conn(file);
 	if (!fc)
 		return -EPERM;
@@ -759,15 +762,6 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	return err;
 }
 
-static ssize_t fuse_dev_read(struct file *file, char __user *buf,
-			     size_t nbytes, loff_t *off)
-{
-	struct iovec iov;
-	iov.iov_len = nbytes;
-	iov.iov_base = buf;
-	return fuse_dev_readv(file, &iov, 1, off);
-}
-
 /* Look up request on processing list by unique ID */
 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 {
@@ -812,15 +806,15 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
  * it from the list and copy the rest of the buffer to the request.
  * The request is finished by calling request_end()
  */
-static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t *off)
+static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
 {
 	int err;
 	unsigned nbytes = iov_length(iov, nr_segs);
 	struct fuse_req *req;
 	struct fuse_out_header oh;
 	struct fuse_copy_state cs;
-	struct fuse_conn *fc = fuse_get_conn(file);
+	struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
 	if (!fc)
 		return -EPERM;
 
@@ -896,15 +890,6 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	return err;
 }
 
-static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
-			      size_t nbytes, loff_t *off)
-{
-	struct iovec iov;
-	iov.iov_len = nbytes;
-	iov.iov_base = (char __user *) buf;
-	return fuse_dev_writev(file, &iov, 1, off);
-}
-
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
 	unsigned mask = POLLOUT | POLLWRNORM;
@@ -1039,10 +1024,10 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
 const struct file_operations fuse_dev_operations = {
 	.owner		= THIS_MODULE,
 	.llseek		= no_llseek,
-	.read		= fuse_dev_read,
-	.readv		= fuse_dev_readv,
-	.write		= fuse_dev_write,
-	.writev		= fuse_dev_writev,
+	.read		= do_sync_read,
+	.aio_read	= fuse_dev_read,
+	.write		= do_sync_write,
+	.aio_write	= fuse_dev_write,
 	.poll		= fuse_dev_poll,
 	.release	= fuse_dev_release,
 	.fasync		= fuse_dev_fasync,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 409ce6a7cca4..cfc8f81e60d0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -138,6 +138,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		struct fuse_entry_out outarg;
 		struct fuse_conn *fc;
 		struct fuse_req *req;
+		struct dentry *parent;
 
 		/* Doesn't hurt to "reset" the validity timeout */
 		fuse_invalidate_entry_cache(entry);
@@ -151,8 +152,10 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		if (IS_ERR(req))
 			return 0;
 
-		fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
+		parent = dget_parent(entry);
+		fuse_lookup_init(req, parent->d_inode, entry, &outarg);
 		request_send(fc, req);
+		dput(parent);
 		err = req->out.h.error;
 		/* Zero nodeid is same as -ENOENT */
 		if (!err && !outarg.nodeid)
@@ -163,7 +166,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 				fuse_send_forget(fc, req, outarg.nodeid, 1);
 				return 0;
 			}
+			spin_lock(&fc->lock);
 			fi->nlookup ++;
+			spin_unlock(&fc->lock);
 		}
 		fuse_put_request(fc, req);
 		if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
@@ -175,22 +180,6 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 	return 1;
 }
 
-/*
- * Check if there's already a hashed alias of this directory inode.
- * If yes, then lookup and mkdir must not create a new alias.
- */
-static int dir_alias(struct inode *inode)
-{
-	if (S_ISDIR(inode->i_mode)) {
-		struct dentry *alias = d_find_alias(inode);
-		if (alias) {
-			dput(alias);
-			return 1;
-		}
-	}
-	return 0;
-}
-
 static int invalid_nodeid(u64 nodeid)
 {
 	return !nodeid || nodeid == FUSE_ROOT_ID;
@@ -206,6 +195,24 @@ static int valid_mode(int m)
 		S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
 }
 
+/*
+ * Add a directory inode to a dentry, ensuring that no other dentry
+ * refers to this inode.  Called with fc->inst_mutex.
+ */
+static int fuse_d_add_directory(struct dentry *entry, struct inode *inode)
+{
+	struct dentry *alias = d_find_alias(inode);
+	if (alias) {
+		/* This tries to shrink the subtree below alias */
+		fuse_invalidate_entry(alias);
+		dput(alias);
+		if (!list_empty(&inode->i_dentry))
+			return -EBUSY;
+	}
+	d_add(entry, inode);
+	return 0;
+}
+
 static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 				  struct nameidata *nd)
 {
@@ -241,11 +248,17 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	if (err && err != -ENOENT)
 		return ERR_PTR(err);
 
-	if (inode && dir_alias(inode)) {
-		iput(inode);
-		return ERR_PTR(-EIO);
-	}
-	d_add(entry, inode);
+	if (inode && S_ISDIR(inode->i_mode)) {
+		mutex_lock(&fc->inst_mutex);
+		err = fuse_d_add_directory(entry, inode);
+		mutex_unlock(&fc->inst_mutex);
+		if (err) {
+			iput(inode);
+			return ERR_PTR(err);
+		}
+	} else
+		d_add(entry, inode);
+
 	entry->d_op = &fuse_dentry_operations;
 	if (!err)
 		fuse_change_timeout(entry, &outarg);
@@ -401,12 +414,22 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	}
 	fuse_put_request(fc, req);
 
-	if (dir_alias(inode)) {
-		iput(inode);
-		return -EIO;
-	}
+	if (S_ISDIR(inode->i_mode)) {
+		struct dentry *alias;
+		mutex_lock(&fc->inst_mutex);
+		alias = d_find_alias(inode);
+		if (alias) {
+			/* New directory must have moved since mkdir */
+			mutex_unlock(&fc->inst_mutex);
+			dput(alias);
+			iput(inode);
+			return -EBUSY;
+		}
+		d_instantiate(entry, inode);
+		mutex_unlock(&fc->inst_mutex);
+	} else
+		d_instantiate(entry, inode);
 
-	d_instantiate(entry, inode);
 	fuse_change_timeout(entry, &outarg);
 	fuse_invalidate_attr(dir);
 	return 0;
@@ -508,7 +531,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 		/* Set nlink to zero so the inode can be cleared, if
                    the inode does have more links this will be
                    discovered at the next lookup/getattr */
-		inode->i_nlink = 0;
+		clear_nlink(inode);
 		fuse_invalidate_attr(inode);
 		fuse_invalidate_attr(dir);
 		fuse_invalidate_entry_cache(entry);
@@ -534,7 +557,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
-		entry->d_inode->i_nlink = 0;
+		clear_nlink(entry->d_inode);
 		fuse_invalidate_attr(dir);
 		fuse_invalidate_entry_cache(entry);
 	} else if (err == -EINTR)
@@ -776,7 +799,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
 		if ((mask & MAY_EXEC) && !S_ISDIR(mode) && !(mode & S_IXUGO))
 			return -EACCES;
 
-		if (nd && (nd->flags & LOOKUP_ACCESS))
+		if (nd && (nd->flags & (LOOKUP_ACCESS | LOOKUP_CHDIR)))
 			return fuse_access(inode, mask);
 		return 0;
 	}
@@ -935,14 +958,30 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
 	}
 }
 
+static void fuse_vmtruncate(struct inode *inode, loff_t offset)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int need_trunc;
+
+	spin_lock(&fc->lock);
+	need_trunc = inode->i_size > offset;
+	i_size_write(inode, offset);
+	spin_unlock(&fc->lock);
+
+	if (need_trunc) {
+		struct address_space *mapping = inode->i_mapping;
+		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+		truncate_inode_pages(mapping, offset);
+	}
+}
+
 /*
  * Set attributes, and at the same time refresh them.
  *
  * Truncation is slightly complicated, because the 'truncate' request
  * may fail, in which case we don't want to touch the mapping.
- * vmtruncate() doesn't allow for this case.  So do the rlimit
- * checking by hand and call vmtruncate() only after the file has
- * actually been truncated.
+ * vmtruncate() doesn't allow for this case, so do the rlimit checking
+ * and the actual truncation by hand.
  */
 static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 {
@@ -993,12 +1032,8 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 			make_bad_inode(inode);
 			err = -EIO;
 		} else {
-			if (is_truncate) {
-				loff_t origsize = i_size_read(inode);
-				i_size_write(inode, outarg.attr.size);
-				if (origsize > outarg.attr.size)
-					vmtruncate(inode, outarg.attr.size);
-			}
+			if (is_truncate)
+				fuse_vmtruncate(inode, outarg.attr.size);
 			fuse_change_attributes(inode, &outarg.attr);
 			fi->i_time = time_to_jiffies(outarg.attr_valid,
 						     outarg.attr_valid_nsec);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5c4fcd1dbf59..2bb5ace3882d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -481,8 +481,10 @@ static int fuse_commit_write(struct file *file, struct page *page,
 		err = -EIO;
 	if (!err) {
 		pos += count;
-		if (pos > i_size_read(inode))
+		spin_lock(&fc->lock);
+		if (pos > inode->i_size)
 			i_size_write(inode, pos);
+		spin_unlock(&fc->lock);
 
 		if (offset == 0 && to == PAGE_CACHE_SIZE) {
 			clear_page_dirty(page);
@@ -586,8 +588,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 	}
 	fuse_put_request(fc, req);
 	if (res > 0) {
-		if (write && pos > i_size_read(inode))
-			i_size_write(inode, pos);
+		if (write) {
+			spin_lock(&fc->lock);
+			if (pos > inode->i_size)
+				i_size_write(inode, pos);
+			spin_unlock(&fc->lock);
+		}
 		*ppos = pos;
 	}
 	fuse_invalidate_attr(inode);
@@ -753,8 +759,10 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
 
 static const struct file_operations fuse_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= fuse_file_mmap,
 	.open		= fuse_open,
 	.flush		= fuse_flush,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 69c7750d55b8..91edb8932d90 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -239,6 +239,9 @@ struct fuse_conn {
 	/** Lock protecting accessess to  members of this structure */
 	spinlock_t lock;
 
+	/** Mutex protecting against directory alias creation */
+	struct mutex inst_mutex;
+
 	/** Refcount */
 	atomic_t count;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7d25092262ae..fc4203570370 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -109,6 +109,7 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
 
 void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
 {
+	struct fuse_conn *fc = get_fuse_conn(inode);
 	if (S_ISREG(inode->i_mode) && i_size_read(inode) != attr->size)
 		invalidate_inode_pages(inode->i_mapping);
 
@@ -117,8 +118,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
 	inode->i_nlink   = attr->nlink;
 	inode->i_uid     = attr->uid;
 	inode->i_gid     = attr->gid;
+	spin_lock(&fc->lock);
 	i_size_write(inode, attr->size);
-	inode->i_blksize = PAGE_CACHE_SIZE;
+	spin_unlock(&fc->lock);
 	inode->i_blocks  = attr->blocks;
 	inode->i_atime.tv_sec   = attr->atime;
 	inode->i_atime.tv_nsec  = attr->atimensec;
@@ -131,7 +133,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr)
 static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 {
 	inode->i_mode = attr->mode & S_IFMT;
-	i_size_write(inode, attr->size);
+	inode->i_size = attr->size;
 	if (S_ISREG(inode->i_mode)) {
 		fuse_init_common(inode);
 		fuse_init_file_inode(inode);
@@ -170,7 +172,6 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 	struct inode *inode;
 	struct fuse_inode *fi;
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
-	int retried = 0;
 
  retry:
 	inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
@@ -184,16 +185,16 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
 		fuse_init_inode(inode, attr);
 		unlock_new_inode(inode);
 	} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
-		BUG_ON(retried);
 		/* Inode has changed type, any I/O on the old should fail */
 		make_bad_inode(inode);
 		iput(inode);
-		retried = 1;
 		goto retry;
 	}
 
 	fi = get_fuse_inode(inode);
+	spin_lock(&fc->lock);
 	fi->nlookup ++;
+	spin_unlock(&fc->lock);
 	fuse_change_attributes(inode, attr);
 	return inode;
 }
@@ -252,6 +253,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 	memset(&outarg, 0, sizeof(outarg));
 	req->in.numargs = 0;
 	req->in.h.opcode = FUSE_STATFS;
+	req->in.h.nodeid = get_node_id(dentry->d_inode);
 	req->out.numargs = 1;
 	req->out.args[0].size =
 		fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
@@ -377,6 +379,7 @@ static struct fuse_conn *new_conn(void)
 	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (fc) {
 		spin_lock_init(&fc->lock);
+		mutex_init(&fc->inst_mutex);
 		atomic_set(&fc->count, 1);
 		init_waitqueue_head(&fc->waitq);
 		init_waitqueue_head(&fc->blocked_waitq);
@@ -396,8 +399,10 @@ static struct fuse_conn *new_conn(void)
 
 void fuse_conn_put(struct fuse_conn *fc)
 {
-	if (atomic_dec_and_test(&fc->count))
+	if (atomic_dec_and_test(&fc->count)) {
+		mutex_destroy(&fc->inst_mutex);
 		kfree(fc);
+	}
 }
 
 struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
new file mode 100644
index 000000000000..9ccb78947171
--- /dev/null
+++ b/fs/generic_acl.c
@@ -0,0 +1,197 @@
+/*
+ * fs/generic_acl.c
+ *
+ * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/generic_acl.h>
+
+/**
+ * generic_acl_list  -  Generic xattr_handler->list() operation
+ * @ops:	Filesystem specific getacl and setacl callbacks
+ */
+size_t
+generic_acl_list(struct inode *inode, struct generic_acl_operations *ops,
+		 int type, char *list, size_t list_size)
+{
+	struct posix_acl *acl;
+	const char *name;
+	size_t size;
+
+	acl = ops->getacl(inode, type);
+	if (!acl)
+		return 0;
+	posix_acl_release(acl);
+
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			name = POSIX_ACL_XATTR_ACCESS;
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			name = POSIX_ACL_XATTR_DEFAULT;
+			break;
+
+		default:
+			return 0;
+	}
+	size = strlen(name) + 1;
+	if (list && size <= list_size)
+		memcpy(list, name, size);
+	return size;
+}
+
+/**
+ * generic_acl_get  -  Generic xattr_handler->get() operation
+ * @ops:	Filesystem specific getacl and setacl callbacks
+ */
+int
+generic_acl_get(struct inode *inode, struct generic_acl_operations *ops,
+		int type, void *buffer, size_t size)
+{
+	struct posix_acl *acl;
+	int error;
+
+	acl = ops->getacl(inode, type);
+	if (!acl)
+		return -ENODATA;
+	error = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+/**
+ * generic_acl_set  -  Generic xattr_handler->set() operation
+ * @ops:	Filesystem specific getacl and setacl callbacks
+ */
+int
+generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
+		int type, const void *value, size_t size)
+{
+	struct posix_acl *acl = NULL;
+	int error;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
+		return -EPERM;
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+	}
+	if (acl) {
+		mode_t mode;
+
+		error = posix_acl_valid(acl);
+		if (error)
+			goto failed;
+		switch(type) {
+			case ACL_TYPE_ACCESS:
+				mode = inode->i_mode;
+				error = posix_acl_equiv_mode(acl, &mode);
+				if (error < 0)
+					goto failed;
+				inode->i_mode = mode;
+				if (error == 0) {
+					posix_acl_release(acl);
+					acl = NULL;
+				}
+				break;
+
+			case ACL_TYPE_DEFAULT:
+				if (!S_ISDIR(inode->i_mode)) {
+					error = -EINVAL;
+					goto failed;
+				}
+				break;
+		}
+	}
+	ops->setacl(inode, type, acl);
+	error = 0;
+failed:
+	posix_acl_release(acl);
+	return error;
+}
+
+/**
+ * generic_acl_init  -  Take care of acl inheritance at @inode create time
+ * @ops:	Filesystem specific getacl and setacl callbacks
+ *
+ * Files created inside a directory with a default ACL inherit the
+ * directory's default ACL.
+ */
+int
+generic_acl_init(struct inode *inode, struct inode *dir,
+		 struct generic_acl_operations *ops)
+{
+	struct posix_acl *acl = NULL;
+	mode_t mode = inode->i_mode;
+	int error;
+
+	inode->i_mode = mode & ~current->fs->umask;
+	if (!S_ISLNK(inode->i_mode))
+		acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
+	if (acl) {
+		struct posix_acl *clone;
+
+		if (S_ISDIR(inode->i_mode)) {
+			clone = posix_acl_clone(acl, GFP_KERNEL);
+			error = -ENOMEM;
+			if (!clone)
+				goto cleanup;
+			ops->setacl(inode, ACL_TYPE_DEFAULT, clone);
+			posix_acl_release(clone);
+		}
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		error = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+		error = posix_acl_create_masq(clone, &mode);
+		if (error >= 0) {
+			inode->i_mode = mode;
+			if (error > 0)
+				ops->setacl(inode, ACL_TYPE_ACCESS, clone);
+		}
+		posix_acl_release(clone);
+	}
+	error = 0;
+
+cleanup:
+	posix_acl_release(acl);
+	return error;
+}
+
+/**
+ * generic_acl_chmod  -  change the access acl of @inode upon chmod()
+ * @ops:	FIlesystem specific getacl and setacl callbacks
+ *
+ * A chmod also changes the permissions of the owner, group/mask, and
+ * other ACL entries.
+ */
+int
+generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops)
+{
+	struct posix_acl *acl, *clone;
+	int error = 0;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+	acl = ops->getacl(inode, ACL_TYPE_ACCESS);
+	if (acl) {
+		clone = posix_acl_clone(acl, GFP_KERNEL);
+		posix_acl_release(acl);
+		if (!clone)
+			return -ENOMEM;
+		error = posix_acl_chmod_masq(clone, inode->i_mode);
+		if (!error)
+			ops->setacl(inode, ACL_TYPE_ACCESS, clone);
+		posix_acl_release(clone);
+	}
+	return error;
+}
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
new file mode 100644
index 000000000000..8c27de8b9568
--- /dev/null
+++ b/fs/gfs2/Kconfig
@@ -0,0 +1,44 @@
+config GFS2_FS
+	tristate "GFS2 file system support"
+	depends on EXPERIMENTAL
+	select FS_POSIX_ACL
+	help
+	A cluster filesystem.
+
+	Allows a cluster of computers to simultaneously use a block device
+	that is shared between them (with FC, iSCSI, NBD, etc...).  GFS reads
+	and writes to the block device like a local filesystem, but also uses
+	a lock module to allow the computers coordinate their I/O so
+	filesystem consistency is maintained.  One of the nifty features of
+	GFS is perfect consistency -- changes made to the filesystem on one
+	machine show up immediately on all other machines in the cluster.
+
+	To use the GFS2 filesystem, you will need to enable one or more of
+	the below locking modules. Documentation and utilities for GFS2 can
+	be found here: http://sources.redhat.com/cluster
+
+config GFS2_FS_LOCKING_NOLOCK
+	tristate "GFS2 \"nolock\" locking module"
+	depends on GFS2_FS
+	help
+	Single node locking module for GFS2.
+
+	Use this module if you want to use GFS2 on a single node without
+	its clustering features. You can still take advantage of the
+	large file support, and upgrade to running a full cluster later on
+	if required.
+
+	If you will only be using GFS2 in cluster mode, you do not need this
+	module.
+
+config GFS2_FS_LOCKING_DLM
+	tristate "GFS2 DLM locking module"
+	depends on GFS2_FS
+	select DLM
+	help
+	Multiple node locking module for GFS2
+
+	Most users of GFS2 will require this module. It provides the locking
+	interface between GFS2 and the DLM, which is required to use GFS2
+	in a cluster environment.
+
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
new file mode 100644
index 000000000000..e3f1ada643ac
--- /dev/null
+++ b/fs/gfs2/Makefile
@@ -0,0 +1,10 @@
+obj-$(CONFIG_GFS2_FS) += gfs2.o
+gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
+	glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \
+	mount.o ondisk.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
+	ops_fstype.o ops_inode.o ops_super.o ops_vm.o quota.o \
+	recovery.o rgrp.o super.o sys.o trans.o util.o
+
+obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
+obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
+
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
new file mode 100644
index 000000000000..5f959b8ce406
--- /dev/null
+++ b/fs/gfs2/acl.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "trans.h"
+#include "util.h"
+
+#define ACL_ACCESS 1
+#define ACL_DEFAULT 0
+
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+		      struct gfs2_ea_request *er,
+		      int *remove, mode_t *mode)
+{
+	struct posix_acl *acl;
+	int error;
+
+	error = gfs2_acl_validate_remove(ip, access);
+	if (error)
+		return error;
+
+	if (!er->er_data)
+		return -EINVAL;
+
+	acl = posix_acl_from_xattr(er->er_data, er->er_data_len);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (!acl) {
+		*remove = 1;
+		return 0;
+	}
+
+	error = posix_acl_valid(acl);
+	if (error)
+		goto out;
+
+	if (access) {
+		error = posix_acl_equiv_mode(acl, mode);
+		if (!error)
+			*remove = 1;
+		else if (error > 0)
+			error = 0;
+	}
+
+out:
+	posix_acl_release(acl);
+	return error;
+}
+
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
+{
+	if (!GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl)
+		return -EOPNOTSUPP;
+	if (current->fsuid != ip->i_di.di_uid && !capable(CAP_FOWNER))
+		return -EPERM;
+	if (S_ISLNK(ip->i_di.di_mode))
+		return -EOPNOTSUPP;
+	if (!access && !S_ISDIR(ip->i_di.di_mode))
+		return -EACCES;
+
+	return 0;
+}
+
+static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
+		   struct gfs2_ea_location *el, char **data, unsigned int *len)
+{
+	struct gfs2_ea_request er;
+	struct gfs2_ea_location el_this;
+	int error;
+
+	if (!ip->i_di.di_eattr)
+		return 0;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	if (access) {
+		er.er_name = GFS2_POSIX_ACL_ACCESS;
+		er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+	} else {
+		er.er_name = GFS2_POSIX_ACL_DEFAULT;
+		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+	}
+	er.er_type = GFS2_EATYPE_SYS;
+
+	if (!el)
+		el = &el_this;
+
+	error = gfs2_ea_find(ip, &er, el);
+	if (error)
+		return error;
+	if (!el->el_ea)
+		return 0;
+	if (!GFS2_EA_DATA_LEN(el->el_ea))
+		goto out;
+
+	er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
+	er.er_data = kmalloc(er.er_data_len, GFP_KERNEL);
+	error = -ENOMEM;
+	if (!er.er_data)
+		goto out;
+
+	error = gfs2_ea_get_copy(ip, el, er.er_data);
+	if (error)
+		goto out_kfree;
+
+	if (acl) {
+		*acl = posix_acl_from_xattr(er.er_data, er.er_data_len);
+		if (IS_ERR(*acl))
+			error = PTR_ERR(*acl);
+	}
+
+out_kfree:
+	if (error || !data)
+		kfree(er.er_data);
+	else {
+		*data = er.er_data;
+		*len = er.er_data_len;
+	}
+out:
+	if (error || el == &el_this)
+		brelse(el->el_bh);
+	return error;
+}
+
+/**
+ * gfs2_check_acl_locked - Check an ACL to see if we're allowed to do something
+ * @inode: the file we want to do something to
+ * @mask: what we want to do
+ *
+ * Returns: errno
+ */
+
+int gfs2_check_acl_locked(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = NULL;
+	int error;
+
+	error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL);
+	if (error)
+		return error;
+
+	if (acl) {
+		error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return error;
+	}
+
+	return -EAGAIN;
+}
+
+int gfs2_check_acl(struct inode *inode, int mask)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	int error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (!error) {
+		error = gfs2_check_acl_locked(inode, mask);
+		gfs2_glock_dq_uninit(&i_gh);
+	}
+
+	return error;
+}
+
+static int munge_mode(struct gfs2_inode *ip, mode_t mode)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_assert_withdraw(sdp,
+				(ip->i_di.di_mode & S_IFMT) == (mode & S_IFMT));
+		ip->i_di.di_mode = mode;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+	return 0;
+}
+
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct posix_acl *acl = NULL, *clone;
+	struct gfs2_ea_request er;
+	mode_t mode = ip->i_di.di_mode;
+	int error;
+
+	if (!sdp->sd_args.ar_posix_acl)
+		return 0;
+	if (S_ISLNK(ip->i_di.di_mode))
+		return 0;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	er.er_type = GFS2_EATYPE_SYS;
+
+	error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
+			&er.er_data, &er.er_data_len);
+	if (error)
+		return error;
+	if (!acl) {
+		mode &= ~current->fs->umask;
+		if (mode != ip->i_di.di_mode)
+			error = munge_mode(ip, mode);
+		return error;
+	}
+
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	error = -ENOMEM;
+	if (!clone)
+		goto out;
+	posix_acl_release(acl);
+	acl = clone;
+
+	if (S_ISDIR(ip->i_di.di_mode)) {
+		er.er_name = GFS2_POSIX_ACL_DEFAULT;
+		er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
+		error = gfs2_system_eaops.eo_set(ip, &er);
+		if (error)
+			goto out;
+	}
+
+	error = posix_acl_create_masq(acl, &mode);
+	if (error < 0)
+		goto out;
+	if (error > 0) {
+		er.er_name = GFS2_POSIX_ACL_ACCESS;
+		er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
+		posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
+		er.er_mode = mode;
+		er.er_flags = GFS2_ERF_MODE;
+		error = gfs2_system_eaops.eo_set(ip, &er);
+		if (error)
+			goto out;
+	} else
+		munge_mode(ip, mode);
+
+out:
+	posix_acl_release(acl);
+	kfree(er.er_data);
+	return error;
+}
+
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
+{
+	struct posix_acl *acl = NULL, *clone;
+	struct gfs2_ea_location el;
+	char *data;
+	unsigned int len;
+	int error;
+
+	error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len);
+	if (error)
+		return error;
+	if (!acl)
+		return gfs2_setattr_simple(ip, attr);
+
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	error = -ENOMEM;
+	if (!clone)
+		goto out;
+	posix_acl_release(acl);
+	acl = clone;
+
+	error = posix_acl_chmod_masq(acl, attr->ia_mode);
+	if (!error) {
+		posix_acl_to_xattr(acl, data, len);
+		error = gfs2_ea_acl_chmod(ip, &el, attr, data);
+	}
+
+out:
+	posix_acl_release(acl);
+	brelse(el.el_bh);
+	kfree(data);
+	return error;
+}
+
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
new file mode 100644
index 000000000000..05c294fe0d78
--- /dev/null
+++ b/fs/gfs2/acl.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __ACL_DOT_H__
+#define __ACL_DOT_H__
+
+#include "incore.h"
+
+#define GFS2_POSIX_ACL_ACCESS		"posix_acl_access"
+#define GFS2_POSIX_ACL_ACCESS_LEN	16
+#define GFS2_POSIX_ACL_DEFAULT		"posix_acl_default"
+#define GFS2_POSIX_ACL_DEFAULT_LEN	17
+
+#define GFS2_ACL_IS_ACCESS(name, len) \
+         ((len) == GFS2_POSIX_ACL_ACCESS_LEN && \
+         !memcmp(GFS2_POSIX_ACL_ACCESS, (name), (len)))
+
+#define GFS2_ACL_IS_DEFAULT(name, len) \
+         ((len) == GFS2_POSIX_ACL_DEFAULT_LEN && \
+         !memcmp(GFS2_POSIX_ACL_DEFAULT, (name), (len)))
+
+struct gfs2_ea_request;
+
+int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
+			  struct gfs2_ea_request *er,
+			  int *remove, mode_t *mode);
+int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access);
+int gfs2_check_acl_locked(struct inode *inode, int mask);
+int gfs2_check_acl(struct inode *inode, int mask);
+int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip);
+int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
+
+#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
new file mode 100644
index 000000000000..06e9a8cb45e9
--- /dev/null
+++ b/fs/gfs2/bmap.c
@@ -0,0 +1,1222 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "dir.h"
+#include "util.h"
+#include "ops_address.h"
+
+/* This doesn't need to be that large as max 64 bit pointers in a 4k
+ * block is 512, so __u16 is fine for that. It saves stack space to
+ * keep it small.
+ */
+struct metapath {
+	__u16 mp_list[GFS2_MAX_META_HEIGHT];
+};
+
+typedef int (*block_call_t) (struct gfs2_inode *ip, struct buffer_head *dibh,
+			     struct buffer_head *bh, u64 *top,
+			     u64 *bottom, unsigned int height,
+			     void *data);
+
+struct strip_mine {
+	int sm_first;
+	unsigned int sm_height;
+};
+
+/**
+ * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @block: the block number that was allocated
+ * @private: any locked page held by the caller process
+ *
+ * Returns: errno
+ */
+
+static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
+			       u64 block, struct page *page)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct inode *inode = &ip->i_inode;
+	struct buffer_head *bh;
+	int release = 0;
+
+	if (!page || page->index) {
+		page = grab_cache_page(inode->i_mapping, 0);
+		if (!page)
+			return -ENOMEM;
+		release = 1;
+	}
+
+	if (!PageUptodate(page)) {
+		void *kaddr = kmap(page);
+
+		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
+		       ip->i_di.di_size);
+		memset(kaddr + ip->i_di.di_size, 0,
+		       PAGE_CACHE_SIZE - ip->i_di.di_size);
+		kunmap(page);
+
+		SetPageUptodate(page);
+	}
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << inode->i_blkbits,
+				     (1 << BH_Uptodate));
+
+	bh = page_buffers(page);
+
+	if (!buffer_mapped(bh))
+		map_bh(bh, inode->i_sb, block);
+
+	set_buffer_uptodate(bh);
+	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+		gfs2_trans_add_bh(ip->i_gl, bh, 0);
+	mark_buffer_dirty(bh);
+
+	if (release) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
+ * @ip: The GFS2 inode to unstuff
+ * @unstuffer: the routine that handles unstuffing a non-zero length file
+ * @private: private data for the unstuffer
+ *
+ * This routine unstuffs a dinode and returns it to a "normal" state such
+ * that the height can be grown in the traditional way.
+ *
+ * Returns: errno
+ */
+
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
+{
+	struct buffer_head *bh, *dibh;
+	struct gfs2_dinode *di;
+	u64 block = 0;
+	int isdir = gfs2_is_dir(ip);
+	int error;
+
+	down_write(&ip->i_rw_mutex);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	if (ip->i_di.di_size) {
+		/* Get a free block, fill it with the stuffed data,
+		   and write it out to disk */
+
+		if (isdir) {
+			block = gfs2_alloc_meta(ip);
+
+			error = gfs2_dir_get_new_buffer(ip, block, &bh);
+			if (error)
+				goto out_brelse;
+			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
+					      dibh, sizeof(struct gfs2_dinode));
+			brelse(bh);
+		} else {
+			block = gfs2_alloc_data(ip);
+
+			error = gfs2_unstuffer_page(ip, dibh, block, page);
+			if (error)
+				goto out_brelse;
+		}
+	}
+
+	/*  Set up the pointer to the new block  */
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	di = (struct gfs2_dinode *)dibh->b_data;
+	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+	if (ip->i_di.di_size) {
+		*(__be64 *)(di + 1) = cpu_to_be64(block);
+		ip->i_di.di_blocks++;
+		di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+	}
+
+	ip->i_di.di_height = 1;
+	di->di_height = cpu_to_be16(1);
+
+out_brelse:
+	brelse(dibh);
+out:
+	up_write(&ip->i_rw_mutex);
+	return error;
+}
+
+/**
+ * calc_tree_height - Calculate the height of a metadata tree
+ * @ip: The GFS2 inode
+ * @size: The proposed size of the file
+ *
+ * Work out how tall a metadata tree needs to be in order to accommodate a
+ * file of a particular size. If size is less than the current size of
+ * the inode, then the current size of the inode is used instead of the
+ * supplied one.
+ *
+ * Returns: the height the tree should be
+ */
+
+static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 *arr;
+	unsigned int max, height;
+
+	if (ip->i_di.di_size > size)
+		size = ip->i_di.di_size;
+
+	if (gfs2_is_dir(ip)) {
+		arr = sdp->sd_jheightsize;
+		max = sdp->sd_max_jheight;
+	} else {
+		arr = sdp->sd_heightsize;
+		max = sdp->sd_max_height;
+	}
+
+	for (height = 0; height < max; height++)
+		if (arr[height] >= size)
+			break;
+
+	return height;
+}
+
+/**
+ * build_height - Build a metadata tree of the requested height
+ * @ip: The GFS2 inode
+ * @height: The height to build to
+ *
+ *
+ * Returns: errno
+ */
+
+static int build_height(struct inode *inode, unsigned height)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	unsigned new_height = height - ip->i_di.di_height;
+	struct buffer_head *dibh;
+	struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
+	struct gfs2_dinode *di;
+	int error;
+	u64 *bp;
+	u64 bn;
+	unsigned n;
+
+	if (height <= ip->i_di.di_height)
+		return 0;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	for(n = 0; n < new_height; n++) {
+		bn = gfs2_alloc_meta(ip);
+		blocks[n] = gfs2_meta_new(ip->i_gl, bn);
+		gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
+	}
+
+	n = 0;
+	bn = blocks[0]->b_blocknr;
+	if (new_height > 1) {
+		for(; n < new_height-1; n++) {
+			gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
+					  GFS2_FORMAT_IN);
+			gfs2_buffer_clear_tail(blocks[n],
+					       sizeof(struct gfs2_meta_header));
+			bp = (u64 *)(blocks[n]->b_data +
+				     sizeof(struct gfs2_meta_header));
+			*bp = cpu_to_be64(blocks[n+1]->b_blocknr);
+			brelse(blocks[n]);
+			blocks[n] = NULL;
+		}
+	}
+	gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+	gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
+			      dibh, sizeof(struct gfs2_dinode));
+	brelse(blocks[n]);
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	di = (struct gfs2_dinode *)dibh->b_data;
+	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+	*(__be64 *)(di + 1) = cpu_to_be64(bn);
+	ip->i_di.di_height += new_height;
+	ip->i_di.di_blocks += new_height;
+	di->di_height = cpu_to_be16(ip->i_di.di_height);
+	di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
+	brelse(dibh);
+	return error;
+}
+
+/**
+ * find_metapath - Find path through the metadata tree
+ * @ip: The inode pointer
+ * @mp: The metapath to return the result in
+ * @block: The disk block to look up
+ *
+ *   This routine returns a struct metapath structure that defines a path
+ *   through the metadata of inode "ip" to get to block "block".
+ *
+ *   Example:
+ *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
+ *   filesystem with a blocksize of 4096.
+ *
+ *   find_metapath() would return a struct metapath structure set to:
+ *   mp_offset = 101342453, mp_height = 3, mp_list[0] = 0, mp_list[1] = 48,
+ *   and mp_list[2] = 165.
+ *
+ *   That means that in order to get to the block containing the byte at
+ *   offset 101342453, we would load the indirect block pointed to by pointer
+ *   0 in the dinode.  We would then load the indirect block pointed to by
+ *   pointer 48 in that indirect block.  We would then load the data block
+ *   pointed to by pointer 165 in that indirect block.
+ *
+ *             ----------------------------------------
+ *             | Dinode |                             |
+ *             |        |                            4|
+ *             |        |0 1 2 3 4 5                 9|
+ *             |        |                            6|
+ *             ----------------------------------------
+ *                       |
+ *                       |
+ *                       V
+ *             ----------------------------------------
+ *             | Indirect Block                       |
+ *             |                                     5|
+ *             |            4 4 4 4 4 5 5            1|
+ *             |0           5 6 7 8 9 0 1            2|
+ *             ----------------------------------------
+ *                                |
+ *                                |
+ *                                V
+ *             ----------------------------------------
+ *             | Indirect Block                       |
+ *             |                         1 1 1 1 1   5|
+ *             |                         6 6 6 6 6   1|
+ *             |0                        3 4 5 6 7   2|
+ *             ----------------------------------------
+ *                                           |
+ *                                           |
+ *                                           V
+ *             ----------------------------------------
+ *             | Data block containing offset         |
+ *             |            101342453                 |
+ *             |                                      |
+ *             |                                      |
+ *             ----------------------------------------
+ *
+ */
+
+static void find_metapath(struct gfs2_inode *ip, u64 block,
+			  struct metapath *mp)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 b = block;
+	unsigned int i;
+
+	for (i = ip->i_di.di_height; i--;)
+		mp->mp_list[i] = do_div(b, sdp->sd_inptrs);
+
+}
+
+/**
+ * metapointer - Return pointer to start of metadata in a buffer
+ * @bh: The buffer
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ *
+ * Return a pointer to the block number of the next height of the metadata
+ * tree given a buffer containing the pointer to the current height of the
+ * metadata tree.
+ */
+
+static inline u64 *metapointer(struct buffer_head *bh, int *boundary,
+			       unsigned int height, const struct metapath *mp)
+{
+	unsigned int head_size = (height > 0) ?
+		sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
+	u64 *ptr;
+	*boundary = 0;
+	ptr = ((u64 *)(bh->b_data + head_size)) + mp->mp_list[height];
+	if (ptr + 1 == (u64 *)(bh->b_data + bh->b_size))
+		*boundary = 1;
+	return ptr;
+}
+
+/**
+ * lookup_block - Get the next metadata block in metadata tree
+ * @ip: The GFS2 inode
+ * @bh: Buffer containing the pointers to metadata blocks
+ * @height: The height of the tree (0 = dinode)
+ * @mp: The metapath
+ * @create: Non-zero if we may create a new meatdata block
+ * @new: Used to indicate if we did create a new metadata block
+ * @block: the returned disk block number
+ *
+ * Given a metatree, complete to a particular height, checks to see if the next
+ * height of the tree exists. If not the next height of the tree is created.
+ * The block number of the next height of the metadata tree is returned.
+ *
+ */
+
+static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh,
+			unsigned int height, struct metapath *mp, int create,
+			int *new, u64 *block)
+{
+	int boundary;
+	u64 *ptr = metapointer(bh, &boundary, height, mp);
+
+	if (*ptr) {
+		*block = be64_to_cpu(*ptr);
+		return boundary;
+	}
+
+	*block = 0;
+
+	if (!create)
+		return 0;
+
+	if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip))
+		*block = gfs2_alloc_data(ip);
+	else
+		*block = gfs2_alloc_meta(ip);
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+	*ptr = cpu_to_be64(*block);
+	ip->i_di.di_blocks++;
+
+	*new = 1;
+	return 0;
+}
+
+/**
+ * gfs2_block_pointers - Map a block from an inode to a disk block
+ * @inode: The inode
+ * @lblock: The logical block number
+ * @map_bh: The bh to be mapped
+ * @mp: metapath to use
+ *
+ * Find the block number on the current device which corresponds to an
+ * inode's block. If the block had to be created, "new" will be set.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_block_pointers(struct inode *inode, u64 lblock, int create,
+			       struct buffer_head *bh_map, struct metapath *mp)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *bh;
+	unsigned int bsize;
+	unsigned int height;
+	unsigned int end_of_metadata;
+	unsigned int x;
+	int error = 0;
+	int new = 0;
+	u64 dblock = 0;
+	int boundary;
+	unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+
+	BUG_ON(maxlen == 0);
+
+	if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip)))
+		return 0;
+
+	bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize;
+
+	height = calc_tree_height(ip, (lblock + 1) * bsize);
+	if (ip->i_di.di_height < height) {
+		if (!create)
+			return 0;
+
+		error = build_height(inode, height);
+		if (error)
+			return error;
+	}
+
+	find_metapath(ip, lblock, mp);
+	end_of_metadata = ip->i_di.di_height - 1;
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		return error;
+
+	for (x = 0; x < end_of_metadata; x++) {
+		lookup_block(ip, bh, x, mp, create, &new, &dblock);
+		brelse(bh);
+		if (!dblock)
+			return 0;
+
+		error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh);
+		if (error)
+			return error;
+	}
+
+	boundary = lookup_block(ip, bh, end_of_metadata, mp, create, &new, &dblock);
+	clear_buffer_mapped(bh_map);
+	clear_buffer_new(bh_map);
+	clear_buffer_boundary(bh_map);
+
+	if (dblock) {
+		map_bh(bh_map, inode->i_sb, dblock);
+		if (boundary)
+			set_buffer_boundary(bh);
+		if (new) {
+			struct buffer_head *dibh;
+			error = gfs2_meta_inode_buffer(ip, &dibh);
+			if (!error) {
+				gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+				gfs2_dinode_out(&ip->i_di, dibh->b_data);
+				brelse(dibh);
+			}
+			set_buffer_new(bh_map);
+			goto out_brelse;
+		}
+		while(--maxlen && !buffer_boundary(bh_map)) {
+			u64 eblock;
+
+			mp->mp_list[end_of_metadata]++;
+			boundary = lookup_block(ip, bh, end_of_metadata, mp, 0, &new, &eblock);
+			if (eblock != ++dblock)
+				break;
+			bh_map->b_size += (1 << inode->i_blkbits);
+			if (boundary)
+				set_buffer_boundary(bh_map);
+		}
+	}
+out_brelse:
+	brelse(bh);
+	return 0;
+}
+
+
+static inline void bmap_lock(struct inode *inode, int create)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	if (create)
+		down_write(&ip->i_rw_mutex);
+	else
+		down_read(&ip->i_rw_mutex);
+}
+
+static inline void bmap_unlock(struct inode *inode, int create)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	if (create)
+		up_write(&ip->i_rw_mutex);
+	else
+		up_read(&ip->i_rw_mutex);
+}
+
+int gfs2_block_map(struct inode *inode, u64 lblock, int create,
+		   struct buffer_head *bh)
+{
+	struct metapath mp;
+	int ret;
+
+	bmap_lock(inode, create);
+	ret = gfs2_block_pointers(inode, lblock, create, bh, &mp);
+	bmap_unlock(inode, create);
+	return ret;
+}
+
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
+{
+	struct metapath mp;
+	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
+	int ret;
+	int create = *new;
+
+	BUG_ON(!extlen);
+	BUG_ON(!dblock);
+	BUG_ON(!new);
+
+	bh.b_size = 1 << (inode->i_blkbits + 5);
+	bmap_lock(inode, create);
+	ret = gfs2_block_pointers(inode, lblock, create, &bh, &mp);
+	bmap_unlock(inode, create);
+	*extlen = bh.b_size >> inode->i_blkbits;
+	*dblock = bh.b_blocknr;
+	if (buffer_new(&bh))
+		*new = 1;
+	else
+		*new = 0;
+	return ret;
+}
+
+/**
+ * recursive_scan - recursively scan through the end of a file
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @mp: the path through the metadata to the point to start
+ * @height: the height the recursion is at
+ * @block: the indirect block to look at
+ * @first: 1 if this is the first block
+ * @bc: the call to make for each piece of metadata
+ * @data: data opaque to this function to pass to @bc
+ *
+ * When this is first called @height and @block should be zero and
+ * @first should be 1.
+ *
+ * Returns: errno
+ */
+
+static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
+			  struct metapath *mp, unsigned int height,
+			  u64 block, int first, block_call_t bc,
+			  void *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *bh = NULL;
+	u64 *top, *bottom;
+	u64 bn;
+	int error;
+	int mh_size = sizeof(struct gfs2_meta_header);
+
+	if (!height) {
+		error = gfs2_meta_inode_buffer(ip, &bh);
+		if (error)
+			return error;
+		dibh = bh;
+
+		top = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
+		bottom = (u64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
+	} else {
+		error = gfs2_meta_indirect_buffer(ip, height, block, 0, &bh);
+		if (error)
+			return error;
+
+		top = (u64 *)(bh->b_data + mh_size) +
+				  (first ? mp->mp_list[height] : 0);
+
+		bottom = (u64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
+	}
+
+	error = bc(ip, dibh, bh, top, bottom, height, data);
+	if (error)
+		goto out;
+
+	if (height < ip->i_di.di_height - 1)
+		for (; top < bottom; top++, first = 0) {
+			if (!*top)
+				continue;
+
+			bn = be64_to_cpu(*top);
+
+			error = recursive_scan(ip, dibh, mp, height + 1, bn,
+					       first, bc, data);
+			if (error)
+				break;
+		}
+
+out:
+	brelse(bh);
+	return error;
+}
+
+/**
+ * do_strip - Look for a layer a particular layer of the file and strip it off
+ * @ip: the inode
+ * @dibh: the dinode buffer
+ * @bh: A buffer of pointers
+ * @top: The first pointer in the buffer
+ * @bottom: One more than the last pointer
+ * @height: the height this buffer is at
+ * @data: a pointer to a struct strip_mine
+ *
+ * Returns: errno
+ */
+
+static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
+		    struct buffer_head *bh, u64 *top, u64 *bottom,
+		    unsigned int height, void *data)
+{
+	struct strip_mine *sm = data;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrp_list rlist;
+	u64 bn, bstart;
+	u32 blen;
+	u64 *p;
+	unsigned int rg_blocks = 0;
+	int metadata;
+	unsigned int revokes = 0;
+	int x;
+	int error;
+
+	if (!*top)
+		sm->sm_first = 0;
+
+	if (height != sm->sm_height)
+		return 0;
+
+	if (sm->sm_first) {
+		top++;
+		sm->sm_first = 0;
+	}
+
+	metadata = (height != ip->i_di.di_height - 1);
+	if (metadata)
+		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
+
+	error = gfs2_rindex_hold(sdp, &ip->i_alloc.al_ri_gh);
+	if (error)
+		return error;
+
+	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+	bstart = 0;
+	blen = 0;
+
+	for (p = top; p < bottom; p++) {
+		if (!*p)
+			continue;
+
+		bn = be64_to_cpu(*p);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_rlist_add(sdp, &rlist, bstart);
+
+			bstart = bn;
+			blen = 1;
+		}
+	}
+
+	if (bstart)
+		gfs2_rlist_add(sdp, &rlist, bstart);
+	else
+		goto out; /* Nothing to do */
+
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+	for (x = 0; x < rlist.rl_rgrps; x++) {
+		struct gfs2_rgrpd *rgd;
+		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+		rg_blocks += rgd->rd_ri.ri_length;
+	}
+
+	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+	if (error)
+		goto out_rlist;
+
+	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
+				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
+				 revokes);
+	if (error)
+		goto out_rg_gunlock;
+
+	down_write(&ip->i_rw_mutex);
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+	bstart = 0;
+	blen = 0;
+
+	for (p = top; p < bottom; p++) {
+		if (!*p)
+			continue;
+
+		bn = be64_to_cpu(*p);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart) {
+				if (metadata)
+					gfs2_free_meta(ip, bstart, blen);
+				else
+					gfs2_free_data(ip, bstart, blen);
+			}
+
+			bstart = bn;
+			blen = 1;
+		}
+
+		*p = 0;
+		if (!ip->i_di.di_blocks)
+			gfs2_consist_inode(ip);
+		ip->i_di.di_blocks--;
+	}
+	if (bstart) {
+		if (metadata)
+			gfs2_free_meta(ip, bstart, blen);
+		else
+			gfs2_free_data(ip, bstart, blen);
+	}
+
+	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+
+	up_write(&ip->i_rw_mutex);
+
+	gfs2_trans_end(sdp);
+
+out_rg_gunlock:
+	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+	gfs2_rlist_free(&rlist);
+out:
+	gfs2_glock_dq_uninit(&ip->i_alloc.al_ri_gh);
+	return error;
+}
+
+/**
+ * do_grow - Make a file look bigger than it is
+ * @ip: the inode
+ * @size: the size to set the file to
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * Returns: errno
+ */
+
+static int do_grow(struct gfs2_inode *ip, u64 size)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al;
+	struct buffer_head *dibh;
+	unsigned int h;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+
+	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out;
+
+	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+	if (error)
+		goto out_gunlock_q;
+
+	al->al_requested = sdp->sd_max_height + RES_DATA;
+
+	error = gfs2_inplace_reserve(ip);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_trans_begin(sdp,
+			sdp->sd_max_height + al->al_rgd->rd_ri.ri_length +
+			RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+	if (error)
+		goto out_ipres;
+
+	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+		if (gfs2_is_stuffed(ip)) {
+			error = gfs2_unstuff_dinode(ip, NULL);
+			if (error)
+				goto out_end_trans;
+		}
+
+		h = calc_tree_height(ip, size);
+		if (ip->i_di.di_height < h) {
+			down_write(&ip->i_rw_mutex);
+			error = build_height(&ip->i_inode, h);
+			up_write(&ip->i_rw_mutex);
+			if (error)
+				goto out_end_trans;
+		}
+	}
+
+	ip->i_di.di_size = size;
+	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out_end_trans;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	brelse(dibh);
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipres:
+	gfs2_inplace_release(ip);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+out:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+
+/**
+ * gfs2_block_truncate_page - Deal with zeroing out data for truncate
+ *
+ * This is partly borrowed from ext3.
+ */
+static int gfs2_block_truncate_page(struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	loff_t from = inode->i_size;
+	unsigned long index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize, iblock, length, pos;
+	struct buffer_head *bh;
+	struct page *page;
+	void *kaddr;
+	int err;
+
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return 0;
+
+	blocksize = inode->i_sb->s_blocksize;
+	length = blocksize - (offset & (blocksize - 1));
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	/* Find the buffer that contains "offset" */
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	err = 0;
+
+	if (!buffer_mapped(bh)) {
+		gfs2_get_block(inode, iblock, bh, 0);
+		/* unmapped? It's a hole - nothing to do */
+		if (!buffer_mapped(bh))
+			goto unlock;
+	}
+
+	/* Ok, it's mapped. Make sure it's up-to-date */
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!buffer_uptodate(bh)) {
+		err = -EIO;
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		/* Uhhuh. Read error. Complain and punt. */
+		if (!buffer_uptodate(bh))
+			goto unlock;
+	}
+
+	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+		gfs2_trans_add_bh(ip->i_gl, bh, 0);
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr + offset, 0, length);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+static int trunc_start(struct gfs2_inode *ip, u64 size)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
+	int journaled = gfs2_is_jdata(ip);
+	int error;
+
+	error = gfs2_trans_begin(sdp,
+				 RES_DINODE + (journaled ? RES_JDATA : 0), 0);
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	if (gfs2_is_stuffed(ip)) {
+		ip->i_di.di_size = size;
+		ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + size);
+		error = 1;
+
+	} else {
+		if (size & (u64)(sdp->sd_sb.sb_bsize - 1))
+			error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
+
+		if (!error) {
+			ip->i_di.di_size = size;
+			ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+			ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
+			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+			gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		}
+	}
+
+	brelse(dibh);
+
+out:
+	gfs2_trans_end(sdp);
+	return error;
+}
+
+static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+{
+	unsigned int height = ip->i_di.di_height;
+	u64 lblock;
+	struct metapath mp;
+	int error;
+
+	if (!size)
+		lblock = 0;
+	else
+		lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift;
+
+	find_metapath(ip, lblock, &mp);
+	gfs2_alloc_get(ip);
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out;
+
+	while (height--) {
+		struct strip_mine sm;
+		sm.sm_first = !!size;
+		sm.sm_height = height;
+
+		error = recursive_scan(ip, NULL, &mp, 0, 0, 1, do_strip, &sm);
+		if (error)
+			break;
+	}
+
+	gfs2_quota_unhold(ip);
+
+out:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+static int trunc_end(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		return error;
+
+	down_write(&ip->i_rw_mutex);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	if (!ip->i_di.di_size) {
+		ip->i_di.di_height = 0;
+		ip->i_di.di_goal_meta =
+			ip->i_di.di_goal_data =
+			ip->i_num.no_addr;
+		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+	}
+	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+	ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	brelse(dibh);
+
+out:
+	up_write(&ip->i_rw_mutex);
+	gfs2_trans_end(sdp);
+	return error;
+}
+
+/**
+ * do_shrink - make a file smaller
+ * @ip: the inode
+ * @size: the size to make the file
+ * @truncator: function to truncate the last partial block
+ *
+ * Called with an exclusive lock on @ip.
+ *
+ * Returns: errno
+ */
+
+static int do_shrink(struct gfs2_inode *ip, u64 size)
+{
+	int error;
+
+	error = trunc_start(ip, size);
+	if (error < 0)
+		return error;
+	if (error > 0)
+		return 0;
+
+	error = trunc_dealloc(ip, size);
+	if (!error)
+		error = trunc_end(ip);
+
+	return error;
+}
+
+/**
+ * gfs2_truncatei - make a file a given size
+ * @ip: the inode
+ * @size: the size to make the file
+ * @truncator: function to truncate the last partial block
+ *
+ * The file size can grow, shrink, or stay the same size.
+ *
+ * Returns: errno
+ */
+
+int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
+{
+	int error;
+
+	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_di.di_mode)))
+		return -EINVAL;
+
+	if (size > ip->i_di.di_size)
+		error = do_grow(ip, size);
+	else
+		error = do_shrink(ip, size);
+
+	return error;
+}
+
+int gfs2_truncatei_resume(struct gfs2_inode *ip)
+{
+	int error;
+	error = trunc_dealloc(ip, ip->i_di.di_size);
+	if (!error)
+		error = trunc_end(ip);
+	return error;
+}
+
+int gfs2_file_dealloc(struct gfs2_inode *ip)
+{
+	return trunc_dealloc(ip, 0);
+}
+
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ */
+
+void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+			    unsigned int *data_blocks, unsigned int *ind_blocks)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int tmp;
+
+	if (gfs2_is_dir(ip)) {
+		*data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
+		*ind_blocks = 3 * (sdp->sd_max_jheight - 1);
+	} else {
+		*data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+		*ind_blocks = 3 * (sdp->sd_max_height - 1);
+	}
+
+	for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		*ind_blocks += tmp;
+	}
+}
+
+/**
+ * gfs2_write_alloc_required - figure out if a write will require an allocation
+ * @ip: the file being written to
+ * @offset: the offset to write to
+ * @len: the number of bytes being written
+ * @alloc_required: set to 1 if an alloc is required, 0 otherwise
+ *
+ * Returns: errno
+ */
+
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+			      unsigned int len, int *alloc_required)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 lblock, lblock_stop, dblock;
+	u32 extlen;
+	int new = 0;
+	int error = 0;
+
+	*alloc_required = 0;
+
+	if (!len)
+		return 0;
+
+	if (gfs2_is_stuffed(ip)) {
+		if (offset + len >
+		    sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+			*alloc_required = 1;
+		return 0;
+	}
+
+	if (gfs2_is_dir(ip)) {
+		unsigned int bsize = sdp->sd_jbsize;
+		lblock = offset;
+		do_div(lblock, bsize);
+		lblock_stop = offset + len + bsize - 1;
+		do_div(lblock_stop, bsize);
+	} else {
+		unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+		lblock = offset >> shift;
+		lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+	}
+
+	for (; lblock < lblock_stop; lblock += extlen) {
+		error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
+		if (error)
+			return error;
+
+		if (!dblock) {
+			*alloc_required = 1;
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
new file mode 100644
index 000000000000..ac2fd04370dc
--- /dev/null
+++ b/fs/gfs2/bmap.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __BMAP_DOT_H__
+#define __BMAP_DOT_H__
+
+struct inode;
+struct gfs2_inode;
+struct page;
+
+int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
+int gfs2_block_map(struct inode *inode, u64 lblock, int create, struct buffer_head *bh);
+int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
+
+int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
+int gfs2_truncatei_resume(struct gfs2_inode *ip);
+int gfs2_file_dealloc(struct gfs2_inode *ip);
+
+void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
+			    unsigned int *data_blocks,
+			    unsigned int *ind_blocks);
+int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
+			      unsigned int len, int *alloc_required);
+
+#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
new file mode 100644
index 000000000000..cab1f68d4685
--- /dev/null
+++ b/fs/gfs2/daemon.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "daemon.h"
+#include "glock.h"
+#include "log.h"
+#include "quota.h"
+#include "recovery.h"
+#include "super.h"
+#include "util.h"
+
+/* This uses schedule_timeout() instead of msleep() because it's good for
+   the daemons to wake up more often than the timeout when unmounting so
+   the user's unmount doesn't sit there forever.
+
+   The kthread functions used to start these daemons block and flush signals. */
+
+/**
+ * gfs2_scand - Look for cached glocks and inodes to toss from memory
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * One of these daemons runs, finding candidates to add to sd_reclaim_list.
+ * See gfs2_glockd()
+ */
+
+int gfs2_scand(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	unsigned long t;
+
+	while (!kthread_should_stop()) {
+		gfs2_scand_internal(sdp);
+		t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
+		schedule_timeout_interruptible(t);
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_glockd - Reclaim unused glock structures
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
+ * Number of daemons can be set by user, with num_glockd mount option.
+ */
+
+int gfs2_glockd(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+
+	while (!kthread_should_stop()) {
+		while (atomic_read(&sdp->sd_reclaim_count))
+			gfs2_reclaim_glock(sdp);
+
+		wait_event_interruptible(sdp->sd_reclaim_wq,
+					 (atomic_read(&sdp->sd_reclaim_count) ||
+					 kthread_should_stop()));
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_recoverd - Recover dead machine's journals
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_recoverd(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	unsigned long t;
+
+	while (!kthread_should_stop()) {
+		gfs2_check_journals(sdp);
+		t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
+		schedule_timeout_interruptible(t);
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * Also, periodically check to make sure that we're using the most recent
+ * journal index.
+ */
+
+int gfs2_logd(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	struct gfs2_holder ji_gh;
+	unsigned long t;
+
+	while (!kthread_should_stop()) {
+		/* Advance the log tail */
+
+		t = sdp->sd_log_flush_time +
+		    gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
+
+		gfs2_ail1_empty(sdp, DIO_ALL);
+
+		if (time_after_eq(jiffies, t)) {
+			gfs2_log_flush(sdp, NULL);
+			sdp->sd_log_flush_time = jiffies;
+		}
+
+		/* Check for latest journal index */
+
+		t = sdp->sd_jindex_refresh_time +
+		    gfs2_tune_get(sdp, gt_jindex_refresh_secs) * HZ;
+
+		if (time_after_eq(jiffies, t)) {
+			if (!gfs2_jindex_hold(sdp, &ji_gh))
+				gfs2_glock_dq_uninit(&ji_gh);
+			sdp->sd_jindex_refresh_time = jiffies;
+		}
+
+		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
+		schedule_timeout_interruptible(t);
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	unsigned long t;
+	int error;
+
+	while (!kthread_should_stop()) {
+		/* Update the master statfs file */
+
+		t = sdp->sd_statfs_sync_time +
+		    gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+
+		if (time_after_eq(jiffies, t)) {
+			error = gfs2_statfs_sync(sdp);
+			if (error &&
+			    error != -EROFS &&
+			    !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+				fs_err(sdp, "quotad: (1) error=%d\n", error);
+			sdp->sd_statfs_sync_time = jiffies;
+		}
+
+		/* Update quota file */
+
+		t = sdp->sd_quota_sync_time +
+		    gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
+
+		if (time_after_eq(jiffies, t)) {
+			error = gfs2_quota_sync(sdp);
+			if (error &&
+			    error != -EROFS &&
+			    !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+				fs_err(sdp, "quotad: (2) error=%d\n", error);
+			sdp->sd_quota_sync_time = jiffies;
+		}
+
+		gfs2_quota_scan(sdp);
+
+		t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
+		schedule_timeout_interruptible(t);
+	}
+
+	return 0;
+}
+
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
new file mode 100644
index 000000000000..801007120fb2
--- /dev/null
+++ b/fs/gfs2/daemon.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __DAEMON_DOT_H__
+#define __DAEMON_DOT_H__
+
+int gfs2_scand(void *data);
+int gfs2_glockd(void *data);
+int gfs2_recoverd(void *data);
+int gfs2_logd(void *data);
+int gfs2_quotad(void *data);
+
+#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
new file mode 100644
index 000000000000..e24af28b1a12
--- /dev/null
+++ b/fs/gfs2/dir.c
@@ -0,0 +1,1957 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+/*
+ * Implements Extendible Hashing as described in:
+ *   "Extendible Hashing" by Fagin, et al in
+ *     __ACM Trans. on Database Systems__, Sept 1979.
+ *
+ *
+ * Here's the layout of dirents which is essentially the same as that of ext2
+ * within a single block. The field de_name_len is the number of bytes
+ * actually required for the name (no null terminator). The field de_rec_len
+ * is the number of bytes allocated to the dirent. The offset of the next
+ * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is
+ * deleted, the preceding dirent inherits its allocated space, ie
+ * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained
+ * by adding de_rec_len to the current dirent, this essentially causes the
+ * deleted dirent to get jumped over when iterating through all the dirents.
+ *
+ * When deleting the first dirent in a block, there is no previous dirent so
+ * the field de_ino is set to zero to designate it as deleted. When allocating
+ * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the
+ * first dirent has (de_ino == 0) and de_rec_len is large enough, this first
+ * dirent is allocated. Otherwise it must go through all the 'used' dirents
+ * searching for one in which the amount of total space minus the amount of
+ * used space will provide enough space for the new dirent.
+ *
+ * There are two types of blocks in which dirents reside. In a stuffed dinode,
+ * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of
+ * the block.  In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
+ * beginning of the leaf block. The dirents reside in leaves when
+ *
+ * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
+ *
+ * Otherwise, the dirents are "linear", within a single stuffed dinode block.
+ *
+ * When the dirents are in leaves, the actual contents of the directory file are
+ * used as an array of 64-bit block pointers pointing to the leaf blocks. The
+ * dirents are NOT in the directory file itself. There can be more than one
+ * block pointer in the array that points to the same leaf. In fact, when a
+ * directory is first converted from linear to exhash, all of the pointers
+ * point to the same leaf.
+ *
+ * When a leaf is completely full, the size of the hash table can be
+ * doubled unless it is already at the maximum size which is hard coded into
+ * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list,
+ * but never before the maximum hash table size has been reached.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/vmalloc.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "bmap.h"
+#include "util.h"
+
+#define IS_LEAF     1 /* Hashed (leaf) directory */
+#define IS_DINODE   2 /* Linear (stuffed dinode block) directory */
+
+#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
+#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+
+typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
+			    u64 leaf_no, void *data);
+typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
+			    const struct qstr *name, void *opaque);
+
+
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+			    struct buffer_head **bhp)
+{
+	struct buffer_head *bh;
+
+	bh = gfs2_meta_new(ip->i_gl, block);
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD);
+	gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
+	*bhp = bh;
+	return 0;
+}
+
+static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
+					struct buffer_head **bhp)
+{
+	struct buffer_head *bh;
+	int error;
+
+	error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
+	if (error)
+		return error;
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
+		brelse(bh);
+		return -EIO;
+	}
+	*bhp = bh;
+	return 0;
+}
+
+static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
+				  unsigned int offset, unsigned int size)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
+	if (ip->i_di.di_size < offset + size)
+		ip->i_di.di_size = offset + size;
+	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+
+	brelse(dibh);
+
+	return size;
+}
+
+
+
+/**
+ * gfs2_dir_write_data - Write directory information to the inode
+ * @ip: The GFS2 inode
+ * @buf: The buffer containing information to be written
+ * @offset: The file offset to start writing at
+ * @size: The amount of data to write
+ *
+ * Returns: The number of bytes correctly written or error code
+ */
+static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
+			       u64 offset, unsigned int size)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
+	u64 lblock, dblock;
+	u32 extlen = 0;
+	unsigned int o;
+	int copied = 0;
+	int error = 0;
+
+	if (!size)
+		return 0;
+
+	if (gfs2_is_stuffed(ip) &&
+	    offset + size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode))
+		return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset,
+					      size);
+
+	if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+		return -EINVAL;
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (error)
+			return error;
+	}
+
+	lblock = offset;
+	o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
+
+	while (copied < size) {
+		unsigned int amount;
+		struct buffer_head *bh;
+		int new = 0;
+
+		amount = size - copied;
+		if (amount > sdp->sd_sb.sb_bsize - o)
+			amount = sdp->sd_sb.sb_bsize - o;
+
+		if (!extlen) {
+			new = 1;
+			error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+						&dblock, &extlen);
+			if (error)
+				goto fail;
+			error = -EIO;
+			if (gfs2_assert_withdraw(sdp, dblock))
+				goto fail;
+		}
+
+		if (amount == sdp->sd_jbsize || new)
+			error = gfs2_dir_get_new_buffer(ip, dblock, &bh);
+		else
+			error = gfs2_dir_get_existing_buffer(ip, dblock, &bh);
+
+		if (error)
+			goto fail;
+
+		gfs2_trans_add_bh(ip->i_gl, bh, 1);
+		memcpy(bh->b_data + o, buf, amount);
+		brelse(bh);
+
+		buf += amount;
+		copied += amount;
+		lblock++;
+		dblock++;
+		extlen--;
+
+		o = sizeof(struct gfs2_meta_header);
+	}
+
+out:
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	if (ip->i_di.di_size < offset + copied)
+		ip->i_di.di_size = offset + copied;
+	ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	brelse(dibh);
+
+	return copied;
+fail:
+	if (copied)
+		goto out;
+	return error;
+}
+
+static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, char *buf,
+				 u64 offset, unsigned int size)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		offset += sizeof(struct gfs2_dinode);
+		memcpy(buf, dibh->b_data + offset, size);
+		brelse(dibh);
+	}
+
+	return (error) ? error : size;
+}
+
+
+/**
+ * gfs2_dir_read_data - Read a data from a directory inode
+ * @ip: The GFS2 Inode
+ * @buf: The buffer to place result into
+ * @offset: File offset to begin jdata_readng from
+ * @size: Amount of data to transfer
+ *
+ * Returns: The amount of data actually copied or the error
+ */
+static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
+			      unsigned int size, unsigned ra)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	u64 lblock, dblock;
+	u32 extlen = 0;
+	unsigned int o;
+	int copied = 0;
+	int error = 0;
+
+	if (offset >= ip->i_di.di_size)
+		return 0;
+
+	if (offset + size > ip->i_di.di_size)
+		size = ip->i_di.di_size - offset;
+
+	if (!size)
+		return 0;
+
+	if (gfs2_is_stuffed(ip))
+		return gfs2_dir_read_stuffed(ip, buf, offset, size);
+
+	if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip)))
+		return -EINVAL;
+
+	lblock = offset;
+	o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header);
+
+	while (copied < size) {
+		unsigned int amount;
+		struct buffer_head *bh;
+		int new;
+
+		amount = size - copied;
+		if (amount > sdp->sd_sb.sb_bsize - o)
+			amount = sdp->sd_sb.sb_bsize - o;
+
+		if (!extlen) {
+			new = 0;
+			error = gfs2_extent_map(&ip->i_inode, lblock, &new,
+						&dblock, &extlen);
+			if (error || !dblock)
+				goto fail;
+			BUG_ON(extlen < 1);
+			if (!ra)
+				extlen = 1;
+			bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
+		} else {
+			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+			if (error)
+				goto fail;
+		}
+		error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD);
+		if (error) {
+			brelse(bh);
+			goto fail;
+		}
+		dblock++;
+		extlen--;
+		memcpy(buf, bh->b_data + o, amount);
+		brelse(bh);
+		buf += amount;
+		copied += amount;
+		lblock++;
+		o = sizeof(struct gfs2_meta_header);
+	}
+
+	return copied;
+fail:
+	return (copied) ? copied : error;
+}
+
+static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent,
+				     const struct qstr *name, int ret)
+{
+	if (dent->de_inum.no_addr != 0 &&
+	    be32_to_cpu(dent->de_hash) == name->hash &&
+	    be16_to_cpu(dent->de_name_len) == name->len &&
+	    memcmp(dent+1, name->name, name->len) == 0)
+		return ret;
+	return 0;
+}
+
+static int gfs2_dirent_find(const struct gfs2_dirent *dent,
+			    const struct qstr *name,
+			    void *opaque)
+{
+	return __gfs2_dirent_find(dent, name, 1);
+}
+
+static int gfs2_dirent_prev(const struct gfs2_dirent *dent,
+			    const struct qstr *name,
+			    void *opaque)
+{
+	return __gfs2_dirent_find(dent, name, 2);
+}
+
+/*
+ * name->name holds ptr to start of block.
+ * name->len holds size of block.
+ */
+static int gfs2_dirent_last(const struct gfs2_dirent *dent,
+			    const struct qstr *name,
+			    void *opaque)
+{
+	const char *start = name->name;
+	const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len);
+	if (name->len == (end - start))
+		return 1;
+	return 0;
+}
+
+static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
+				  const struct qstr *name,
+				  void *opaque)
+{
+	unsigned required = GFS2_DIRENT_SIZE(name->len);
+	unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+	unsigned totlen = be16_to_cpu(dent->de_rec_len);
+
+	if (!dent->de_inum.no_addr)
+		actual = GFS2_DIRENT_SIZE(0);
+	if (totlen - actual >= required)
+		return 1;
+	return 0;
+}
+
+struct dirent_gather {
+	const struct gfs2_dirent **pdent;
+	unsigned offset;
+};
+
+static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
+			      const struct qstr *name,
+			      void *opaque)
+{
+	struct dirent_gather *g = opaque;
+	if (dent->de_inum.no_addr) {
+		g->pdent[g->offset++] = dent;
+	}
+	return 0;
+}
+
+/*
+ * Other possible things to check:
+ * - Inode located within filesystem size (and on valid block)
+ * - Valid directory entry type
+ * Not sure how heavy-weight we want to make this... could also check
+ * hash is correct for example, but that would take a lot of extra time.
+ * For now the most important thing is to check that the various sizes
+ * are correct.
+ */
+static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
+			     unsigned int size, unsigned int len, int first)
+{
+	const char *msg = "gfs2_dirent too small";
+	if (unlikely(size < sizeof(struct gfs2_dirent)))
+		goto error;
+	msg = "gfs2_dirent misaligned";
+	if (unlikely(offset & 0x7))
+		goto error;
+	msg = "gfs2_dirent points beyond end of block";
+	if (unlikely(offset + size > len))
+		goto error;
+	msg = "zero inode number";
+	if (unlikely(!first && !dent->de_inum.no_addr))
+		goto error;
+	msg = "name length is greater than space in dirent";
+	if (dent->de_inum.no_addr &&
+	    unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) >
+		     size))
+		goto error;
+	return 0;
+error:
+	printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
+	       first ? "first in block" : "not first in block");
+	return -EIO;
+}
+
+static int gfs2_dirent_offset(const void *buf)
+{
+	const struct gfs2_meta_header *h = buf;
+	int offset;
+
+	BUG_ON(buf == NULL);
+
+	switch(be32_to_cpu(h->mh_type)) {
+	case GFS2_METATYPE_LF:
+		offset = sizeof(struct gfs2_leaf);
+		break;
+	case GFS2_METATYPE_DI:
+		offset = sizeof(struct gfs2_dinode);
+		break;
+	default:
+		goto wrong_type;
+	}
+	return offset;
+wrong_type:
+	printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
+	       be32_to_cpu(h->mh_type));
+	return -1;
+}
+
+static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
+					    unsigned int len, gfs2_dscan_t scan,
+					    const struct qstr *name,
+					    void *opaque)
+{
+	struct gfs2_dirent *dent, *prev;
+	unsigned offset;
+	unsigned size;
+	int ret = 0;
+
+	ret = gfs2_dirent_offset(buf);
+	if (ret < 0)
+		goto consist_inode;
+
+	offset = ret;
+	prev = NULL;
+	dent = buf + offset;
+	size = be16_to_cpu(dent->de_rec_len);
+	if (gfs2_check_dirent(dent, offset, size, len, 1))
+		goto consist_inode;
+	do {
+		ret = scan(dent, name, opaque);
+		if (ret)
+			break;
+		offset += size;
+		if (offset == len)
+			break;
+		prev = dent;
+		dent = buf + offset;
+		size = be16_to_cpu(dent->de_rec_len);
+		if (gfs2_check_dirent(dent, offset, size, len, 0))
+			goto consist_inode;
+	} while(1);
+
+	switch(ret) {
+	case 0:
+		return NULL;
+	case 1:
+		return dent;
+	case 2:
+		return prev ? prev : dent;
+	default:
+		BUG_ON(ret > 0);
+		return ERR_PTR(ret);
+	}
+
+consist_inode:
+	gfs2_consist_inode(GFS2_I(inode));
+	return ERR_PTR(-EIO);
+}
+
+
+/**
+ * dirent_first - Return the first dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: Pointer to list of dirents
+ *
+ * return first dirent whether bh points to leaf or stuffed dinode
+ *
+ * Returns: IS_LEAF, IS_DINODE, or -errno
+ */
+
+static int dirent_first(struct gfs2_inode *dip, struct buffer_head *bh,
+			struct gfs2_dirent **dent)
+{
+	struct gfs2_meta_header *h = (struct gfs2_meta_header *)bh->b_data;
+
+	if (be32_to_cpu(h->mh_type) == GFS2_METATYPE_LF) {
+		if (gfs2_meta_check(GFS2_SB(&dip->i_inode), bh))
+			return -EIO;
+		*dent = (struct gfs2_dirent *)(bh->b_data +
+					       sizeof(struct gfs2_leaf));
+		return IS_LEAF;
+	} else {
+		if (gfs2_metatype_check(GFS2_SB(&dip->i_inode), bh, GFS2_METATYPE_DI))
+			return -EIO;
+		*dent = (struct gfs2_dirent *)(bh->b_data +
+					       sizeof(struct gfs2_dinode));
+		return IS_DINODE;
+	}
+}
+
+static int dirent_check_reclen(struct gfs2_inode *dip,
+			       const struct gfs2_dirent *d, const void *end_p)
+{
+	const void *ptr = d;
+	u16 rec_len = be16_to_cpu(d->de_rec_len);
+
+	if (unlikely(rec_len < sizeof(struct gfs2_dirent)))
+		goto broken;
+	ptr += rec_len;
+	if (ptr < end_p)
+		return rec_len;
+	if (ptr == end_p)
+		return -ENOENT;
+broken:
+	gfs2_consist_inode(dip);
+	return -EIO;
+}
+
+/**
+ * dirent_next - Next dirent
+ * @dip: the directory
+ * @bh: The buffer
+ * @dent: Pointer to list of dirents
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh,
+		       struct gfs2_dirent **dent)
+{
+	struct gfs2_dirent *cur = *dent, *tmp;
+	char *bh_end = bh->b_data + bh->b_size;
+	int ret;
+
+	ret = dirent_check_reclen(dip, cur, bh_end);
+	if (ret < 0)
+		return ret;
+
+	tmp = (void *)cur + ret;
+	ret = dirent_check_reclen(dip, tmp, bh_end);
+	if (ret == -EIO)
+		return ret;
+
+        /* Only the first dent could ever have de_inum.no_addr == 0 */
+	if (!tmp->de_inum.no_addr) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+
+	*dent = tmp;
+	return 0;
+}
+
+/**
+ * dirent_del - Delete a dirent
+ * @dip: The GFS2 inode
+ * @bh: The buffer
+ * @prev: The previous dirent
+ * @cur: The current dirent
+ *
+ */
+
+static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
+		       struct gfs2_dirent *prev, struct gfs2_dirent *cur)
+{
+	u16 cur_rec_len, prev_rec_len;
+
+	if (!cur->de_inum.no_addr) {
+		gfs2_consist_inode(dip);
+		return;
+	}
+
+	gfs2_trans_add_bh(dip->i_gl, bh, 1);
+
+	/* If there is no prev entry, this is the first entry in the block.
+	   The de_rec_len is already as big as it needs to be.  Just zero
+	   out the inode number and return.  */
+
+	if (!prev) {
+		cur->de_inum.no_addr = 0;	/* No endianess worries */
+		return;
+	}
+
+	/*  Combine this dentry with the previous one.  */
+
+	prev_rec_len = be16_to_cpu(prev->de_rec_len);
+	cur_rec_len = be16_to_cpu(cur->de_rec_len);
+
+	if ((char *)prev + prev_rec_len != (char *)cur)
+		gfs2_consist_inode(dip);
+	if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size)
+		gfs2_consist_inode(dip);
+
+	prev_rec_len += cur_rec_len;
+	prev->de_rec_len = cpu_to_be16(prev_rec_len);
+}
+
+/*
+ * Takes a dent from which to grab space as an argument. Returns the
+ * newly created dent.
+ */
+static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
+					    struct gfs2_dirent *dent,
+					    const struct qstr *name,
+					    struct buffer_head *bh)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_dirent *ndent;
+	unsigned offset = 0, totlen;
+
+	if (dent->de_inum.no_addr)
+		offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+	totlen = be16_to_cpu(dent->de_rec_len);
+	BUG_ON(offset + name->len > totlen);
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	ndent = (struct gfs2_dirent *)((char *)dent + offset);
+	dent->de_rec_len = cpu_to_be16(offset);
+	gfs2_qstr2dirent(name, totlen - offset, ndent);
+	return ndent;
+}
+
+static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
+					     struct buffer_head *bh,
+					     const struct qstr *name)
+{
+	struct gfs2_dirent *dent;
+	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+				gfs2_dirent_find_space, name, NULL);
+	if (!dent || IS_ERR(dent))
+		return dent;
+	return gfs2_init_dirent(inode, dent, name, bh);
+}
+
+static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
+		    struct buffer_head **bhp)
+{
+	int error;
+
+	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
+		/* printk(KERN_INFO "block num=%llu\n", leaf_no); */
+		error = -EIO;
+	}
+
+	return error;
+}
+
+/**
+ * get_leaf_nr - Get a leaf number associated with the index
+ * @dip: The GFS2 inode
+ * @index:
+ * @leaf_out:
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
+		       u64 *leaf_out)
+{
+	u64 leaf_no;
+	int error;
+
+	error = gfs2_dir_read_data(dip, (char *)&leaf_no,
+				    index * sizeof(u64),
+				    sizeof(u64), 0);
+	if (error != sizeof(u64))
+		return (error < 0) ? error : -EIO;
+
+	*leaf_out = be64_to_cpu(leaf_no);
+
+	return 0;
+}
+
+static int get_first_leaf(struct gfs2_inode *dip, u32 index,
+			  struct buffer_head **bh_out)
+{
+	u64 leaf_no;
+	int error;
+
+	error = get_leaf_nr(dip, index, &leaf_no);
+	if (!error)
+		error = get_leaf(dip, leaf_no, bh_out);
+
+	return error;
+}
+
+static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
+					      const struct qstr *name,
+					      gfs2_dscan_t scan,
+					      struct buffer_head **pbh)
+{
+	struct buffer_head *bh;
+	struct gfs2_dirent *dent;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int error;
+
+	if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+		struct gfs2_leaf *leaf;
+		unsigned hsize = 1 << ip->i_di.di_depth;
+		unsigned index;
+		u64 ln;
+		if (hsize * sizeof(u64) != ip->i_di.di_size) {
+			gfs2_consist_inode(ip);
+			return ERR_PTR(-EIO);
+		}
+
+		index = name->hash >> (32 - ip->i_di.di_depth);
+		error = get_first_leaf(ip, index, &bh);
+		if (error)
+			return ERR_PTR(error);
+		do {
+			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+						scan, name, NULL);
+			if (dent)
+				goto got_dent;
+			leaf = (struct gfs2_leaf *)bh->b_data;
+			ln = be64_to_cpu(leaf->lf_next);
+			brelse(bh);
+			if (!ln)
+				break;
+
+			error = get_leaf(ip, ln, &bh);
+		} while(!error);
+
+		return error ? ERR_PTR(error) : NULL;
+	}
+
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		return ERR_PTR(error);
+	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL);
+got_dent:
+	if (unlikely(dent == NULL || IS_ERR(dent))) {
+		brelse(bh);
+		bh = NULL;
+	}
+	*pbh = bh;
+	return dent;
+}
+
+static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	u64 bn = gfs2_alloc_meta(ip);
+	struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
+	struct gfs2_leaf *leaf;
+	struct gfs2_dirent *dent;
+	struct qstr name = { .name = "", .len = 0, .hash = 0 };
+	if (!bh)
+		return NULL;
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
+	leaf = (struct gfs2_leaf *)bh->b_data;
+	leaf->lf_depth = cpu_to_be16(depth);
+	leaf->lf_entries = 0;
+	leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
+	leaf->lf_next = 0;
+	memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
+	dent = (struct gfs2_dirent *)(leaf+1);
+	gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
+	*pbh = bh;
+	return leaf;
+}
+
+/**
+ * dir_make_exhash - Convert a stuffed directory into an ExHash directory
+ * @dip: The GFS2 inode
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+static int dir_make_exhash(struct inode *inode)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_dirent *dent;
+	struct qstr args;
+	struct buffer_head *bh, *dibh;
+	struct gfs2_leaf *leaf;
+	int y;
+	u32 x;
+	u64 *lp, bn;
+	int error;
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (error)
+		return error;
+
+	/*  Turn over a new leaf  */
+
+	leaf = new_leaf(inode, &bh, 0);
+	if (!leaf)
+		return -ENOSPC;
+	bn = bh->b_blocknr;
+
+	gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
+	leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+
+	/*  Copy dirents  */
+
+	gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh,
+			     sizeof(struct gfs2_dinode));
+
+	/*  Find last entry  */
+
+	x = 0;
+	args.len = bh->b_size - sizeof(struct gfs2_dinode) +
+		   sizeof(struct gfs2_leaf);
+	args.name = bh->b_data;
+	dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size,
+				gfs2_dirent_last, &args, NULL);
+	if (!dent) {
+		brelse(bh);
+		brelse(dibh);
+		return -EIO;
+	}
+	if (IS_ERR(dent)) {
+		brelse(bh);
+		brelse(dibh);
+		return PTR_ERR(dent);
+	}
+
+	/*  Adjust the last dirent's record length
+	   (Remember that dent still points to the last entry.)  */
+
+	dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) +
+		sizeof(struct gfs2_dinode) -
+		sizeof(struct gfs2_leaf));
+
+	brelse(bh);
+
+	/*  We're done with the new leaf block, now setup the new
+	    hash table.  */
+
+	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+
+	lp = (u64 *)(dibh->b_data + sizeof(struct gfs2_dinode));
+
+	for (x = sdp->sd_hash_ptrs; x--; lp++)
+		*lp = cpu_to_be64(bn);
+
+	dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
+	dip->i_di.di_blocks++;
+	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+	dip->i_di.di_payload_format = 0;
+
+	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
+	dip->i_di.di_depth = y;
+
+	gfs2_dinode_out(&dip->i_di, dibh->b_data);
+
+	brelse(dibh);
+
+	return 0;
+}
+
+/**
+ * dir_split_leaf - Split a leaf block into two
+ * @dip: The GFS2 inode
+ * @index:
+ * @leaf_no:
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_split_leaf(struct inode *inode, const struct qstr *name)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	struct buffer_head *nbh, *obh, *dibh;
+	struct gfs2_leaf *nleaf, *oleaf;
+	struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new;
+	u32 start, len, half_len, divider;
+	u64 bn, *lp, leaf_no;
+	u32 index;
+	int x, moved = 0;
+	int error;
+
+	index = name->hash >> (32 - dip->i_di.di_depth);
+	error = get_leaf_nr(dip, index, &leaf_no);
+	if (error)
+		return error;
+
+	/*  Get the old leaf block  */
+	error = get_leaf(dip, leaf_no, &obh);
+	if (error)
+		return error;
+
+	oleaf = (struct gfs2_leaf *)obh->b_data;
+	if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) {
+		brelse(obh);
+		return 1; /* can't split */
+	}
+
+	gfs2_trans_add_bh(dip->i_gl, obh, 1);
+
+	nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1);
+	if (!nleaf) {
+		brelse(obh);
+		return -ENOSPC;
+	}
+	bn = nbh->b_blocknr;
+
+	/*  Compute the start and len of leaf pointers in the hash table.  */
+	len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth));
+	half_len = len >> 1;
+	if (!half_len) {
+		printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index);
+		gfs2_consist_inode(dip);
+		error = -EIO;
+		goto fail_brelse;
+	}
+
+	start = (index & ~(len - 1));
+
+	/* Change the pointers.
+	   Don't bother distinguishing stuffed from non-stuffed.
+	   This code is complicated enough already. */
+	lp = kmalloc(half_len * sizeof(u64), GFP_NOFS | __GFP_NOFAIL);
+	/*  Change the pointers  */
+	for (x = 0; x < half_len; x++)
+		lp[x] = cpu_to_be64(bn);
+
+	error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64),
+				    half_len * sizeof(u64));
+	if (error != half_len * sizeof(u64)) {
+		if (error >= 0)
+			error = -EIO;
+		goto fail_lpfree;
+	}
+
+	kfree(lp);
+
+	/*  Compute the divider  */
+	divider = (start + half_len) << (32 - dip->i_di.di_depth);
+
+	/*  Copy the entries  */
+	dirent_first(dip, obh, &dent);
+
+	do {
+		next = dent;
+		if (dirent_next(dip, obh, &next))
+			next = NULL;
+
+		if (dent->de_inum.no_addr &&
+		    be32_to_cpu(dent->de_hash) < divider) {
+			struct qstr str;
+			str.name = (char*)(dent+1);
+			str.len = be16_to_cpu(dent->de_name_len);
+			str.hash = be32_to_cpu(dent->de_hash);
+			new = gfs2_dirent_alloc(inode, nbh, &str);
+			if (IS_ERR(new)) {
+				error = PTR_ERR(new);
+				break;
+			}
+
+			new->de_inum = dent->de_inum; /* No endian worries */
+			new->de_type = dent->de_type; /* No endian worries */
+			nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1);
+
+			dirent_del(dip, obh, prev, dent);
+
+			if (!oleaf->lf_entries)
+				gfs2_consist_inode(dip);
+			oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1);
+
+			if (!prev)
+				prev = dent;
+
+			moved = 1;
+		} else {
+			prev = dent;
+		}
+		dent = next;
+	} while (dent);
+
+	oleaf->lf_depth = nleaf->lf_depth;
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
+		dip->i_di.di_blocks++;
+		gfs2_dinode_out(&dip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	brelse(obh);
+	brelse(nbh);
+
+	return error;
+
+fail_lpfree:
+	kfree(lp);
+
+fail_brelse:
+	brelse(obh);
+	brelse(nbh);
+	return error;
+}
+
+/**
+ * dir_double_exhash - Double size of ExHash table
+ * @dip: The GFS2 dinode
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int dir_double_exhash(struct gfs2_inode *dip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct buffer_head *dibh;
+	u32 hsize;
+	u64 *buf;
+	u64 *from, *to;
+	u64 block;
+	int x;
+	int error = 0;
+
+	hsize = 1 << dip->i_di.di_depth;
+	if (hsize * sizeof(u64) != dip->i_di.di_size) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+
+	/*  Allocate both the "from" and "to" buffers in one big chunk  */
+
+	buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL);
+
+	for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
+		error = gfs2_dir_read_data(dip, (char *)buf,
+					    block * sdp->sd_hash_bsize,
+					    sdp->sd_hash_bsize, 1);
+		if (error != sdp->sd_hash_bsize) {
+			if (error >= 0)
+				error = -EIO;
+			goto fail;
+		}
+
+		from = buf;
+		to = (u64 *)((char *)buf + sdp->sd_hash_bsize);
+
+		for (x = sdp->sd_hash_ptrs; x--; from++) {
+			*to++ = *from;	/*  No endianess worries  */
+			*to++ = *from;
+		}
+
+		error = gfs2_dir_write_data(dip,
+					     (char *)buf + sdp->sd_hash_bsize,
+					     block * sdp->sd_sb.sb_bsize,
+					     sdp->sd_sb.sb_bsize);
+		if (error != sdp->sd_sb.sb_bsize) {
+			if (error >= 0)
+				error = -EIO;
+			goto fail;
+		}
+	}
+
+	kfree(buf);
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (!gfs2_assert_withdraw(sdp, !error)) {
+		dip->i_di.di_depth++;
+		gfs2_dinode_out(&dip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	return error;
+
+fail:
+	kfree(buf);
+	return error;
+}
+
+/**
+ * compare_dents - compare directory entries by hash value
+ * @a: first dent
+ * @b: second dent
+ *
+ * When comparing the hash entries of @a to @b:
+ *   gt: returns 1
+ *   lt: returns -1
+ *   eq: returns 0
+ */
+
+static int compare_dents(const void *a, const void *b)
+{
+	const struct gfs2_dirent *dent_a, *dent_b;
+	u32 hash_a, hash_b;
+	int ret = 0;
+
+	dent_a = *(const struct gfs2_dirent **)a;
+	hash_a = be32_to_cpu(dent_a->de_hash);
+
+	dent_b = *(const struct gfs2_dirent **)b;
+	hash_b = be32_to_cpu(dent_b->de_hash);
+
+	if (hash_a > hash_b)
+		ret = 1;
+	else if (hash_a < hash_b)
+		ret = -1;
+	else {
+		unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
+		unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
+
+		if (len_a > len_b)
+			ret = 1;
+		else if (len_a < len_b)
+			ret = -1;
+		else
+			ret = memcmp(dent_a + 1, dent_b + 1, len_a);
+	}
+
+	return ret;
+}
+
+/**
+ * do_filldir_main - read out directory entries
+ * @dip: The GFS2 inode
+ * @offset: The offset in the file to read from
+ * @opaque: opaque data to pass to filldir
+ * @filldir: The function to pass entries to
+ * @darr: an array of struct gfs2_dirent pointers to read
+ * @entries: the number of entries in darr
+ * @copied: pointer to int that's non-zero if a entry has been copied out
+ *
+ * Jump through some hoops to make sure that if there are hash collsions,
+ * they are read out at the beginning of a buffer.  We want to minimize
+ * the possibility that they will fall into different readdir buffers or
+ * that someone will want to seek to that location.
+ *
+ * Returns: errno, >0 on exception from filldir
+ */
+
+static int do_filldir_main(struct gfs2_inode *dip, u64 *offset,
+			   void *opaque, gfs2_filldir_t filldir,
+			   const struct gfs2_dirent **darr, u32 entries,
+			   int *copied)
+{
+	const struct gfs2_dirent *dent, *dent_next;
+	struct gfs2_inum inum;
+	u64 off, off_next;
+	unsigned int x, y;
+	int run = 0;
+	int error = 0;
+
+	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+
+	dent_next = darr[0];
+	off_next = be32_to_cpu(dent_next->de_hash);
+	off_next = gfs2_disk_hash2offset(off_next);
+
+	for (x = 0, y = 1; x < entries; x++, y++) {
+		dent = dent_next;
+		off = off_next;
+
+		if (y < entries) {
+			dent_next = darr[y];
+			off_next = be32_to_cpu(dent_next->de_hash);
+			off_next = gfs2_disk_hash2offset(off_next);
+
+			if (off < *offset)
+				continue;
+			*offset = off;
+
+			if (off_next == off) {
+				if (*copied && !run)
+					return 1;
+				run = 1;
+			} else
+				run = 0;
+		} else {
+			if (off < *offset)
+				continue;
+			*offset = off;
+		}
+
+		gfs2_inum_in(&inum, (char *)&dent->de_inum);
+
+		error = filldir(opaque, (const char *)(dent + 1),
+				be16_to_cpu(dent->de_name_len),
+				off, &inum,
+				be16_to_cpu(dent->de_type));
+		if (error)
+			return 1;
+
+		*copied = 1;
+	}
+
+	/* Increment the *offset by one, so the next time we come into the
+	   do_filldir fxn, we get the next entry instead of the last one in the
+	   current leaf */
+
+	(*offset)++;
+
+	return 0;
+}
+
+static int gfs2_dir_read_leaf(struct inode *inode, u64 *offset, void *opaque,
+			      gfs2_filldir_t filldir, int *copied,
+			      unsigned *depth, u64 leaf_no)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *bh;
+	struct gfs2_leaf *lf;
+	unsigned entries = 0;
+	unsigned leaves = 0;
+	const struct gfs2_dirent **darr, *dent;
+	struct dirent_gather g;
+	struct buffer_head **larr;
+	int leaf = 0;
+	int error, i;
+	u64 lfn = leaf_no;
+
+	do {
+		error = get_leaf(ip, lfn, &bh);
+		if (error)
+			goto out;
+		lf = (struct gfs2_leaf *)bh->b_data;
+		if (leaves == 0)
+			*depth = be16_to_cpu(lf->lf_depth);
+		entries += be16_to_cpu(lf->lf_entries);
+		leaves++;
+		lfn = be64_to_cpu(lf->lf_next);
+		brelse(bh);
+	} while(lfn);
+
+	if (!entries)
+		return 0;
+
+	error = -ENOMEM;
+	larr = vmalloc((leaves + entries) * sizeof(void *));
+	if (!larr)
+		goto out;
+	darr = (const struct gfs2_dirent **)(larr + leaves);
+	g.pdent = darr;
+	g.offset = 0;
+	lfn = leaf_no;
+
+	do {
+		error = get_leaf(ip, lfn, &bh);
+		if (error)
+			goto out_kfree;
+		lf = (struct gfs2_leaf *)bh->b_data;
+		lfn = be64_to_cpu(lf->lf_next);
+		if (lf->lf_entries) {
+			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
+						gfs2_dirent_gather, NULL, &g);
+			error = PTR_ERR(dent);
+			if (IS_ERR(dent)) {
+				goto out_kfree;
+			}
+			error = 0;
+			larr[leaf++] = bh;
+		} else {
+			brelse(bh);
+		}
+	} while(lfn);
+
+	error = do_filldir_main(ip, offset, opaque, filldir, darr,
+				entries, copied);
+out_kfree:
+	for(i = 0; i < leaf; i++)
+		brelse(larr[i]);
+	vfree(larr);
+out:
+	return error;
+}
+
+/**
+ * dir_e_read - Reads the entries from a directory into a filldir buffer
+ * @dip: dinode pointer
+ * @offset: the hash of the last entry read shifted to the right once
+ * @opaque: buffer for the filldir function to fill
+ * @filldir: points to the filldir function to use
+ *
+ * Returns: errno
+ */
+
+static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
+		      gfs2_filldir_t filldir)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	u32 hsize, len = 0;
+	u32 ht_offset, lp_offset, ht_offset_cur = -1;
+	u32 hash, index;
+	u64 *lp;
+	int copied = 0;
+	int error = 0;
+	unsigned depth = 0;
+
+	hsize = 1 << dip->i_di.di_depth;
+	if (hsize * sizeof(u64) != dip->i_di.di_size) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+
+	hash = gfs2_dir_offset2hash(*offset);
+	index = hash >> (32 - dip->i_di.di_depth);
+
+	lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+	if (!lp)
+		return -ENOMEM;
+
+	while (index < hsize) {
+		lp_offset = index & (sdp->sd_hash_ptrs - 1);
+		ht_offset = index - lp_offset;
+
+		if (ht_offset_cur != ht_offset) {
+			error = gfs2_dir_read_data(dip, (char *)lp,
+						ht_offset * sizeof(u64),
+						sdp->sd_hash_bsize, 1);
+			if (error != sdp->sd_hash_bsize) {
+				if (error >= 0)
+					error = -EIO;
+				goto out;
+			}
+			ht_offset_cur = ht_offset;
+		}
+
+		error = gfs2_dir_read_leaf(inode, offset, opaque, filldir,
+					   &copied, &depth,
+					   be64_to_cpu(lp[lp_offset]));
+		if (error)
+			break;
+
+		len = 1 << (dip->i_di.di_depth - depth);
+		index = (index & ~(len - 1)) + len;
+	}
+
+out:
+	kfree(lp);
+	if (error > 0)
+		error = 0;
+	return error;
+}
+
+int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
+		  gfs2_filldir_t filldir)
+{
+	struct gfs2_inode *dip = GFS2_I(inode);
+	struct dirent_gather g;
+	const struct gfs2_dirent **darr, *dent;
+	struct buffer_head *dibh;
+	int copied = 0;
+	int error;
+
+	if (!dip->i_di.di_entries)
+		return 0;
+
+	if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+		return dir_e_read(inode, offset, opaque, filldir);
+
+	if (!gfs2_is_stuffed(dip)) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (error)
+		return error;
+
+	error = -ENOMEM;
+	darr = kmalloc(dip->i_di.di_entries * sizeof(struct gfs2_dirent *),
+		       GFP_KERNEL);
+	if (darr) {
+		g.pdent = darr;
+		g.offset = 0;
+		dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
+					gfs2_dirent_gather, NULL, &g);
+		if (IS_ERR(dent)) {
+			error = PTR_ERR(dent);
+			goto out;
+		}
+		error = do_filldir_main(dip, offset, opaque, filldir, darr,
+					dip->i_di.di_entries, &copied);
+out:
+		kfree(darr);
+	}
+
+	if (error > 0)
+		error = 0;
+
+	brelse(dibh);
+
+	return error;
+}
+
+/**
+ * gfs2_dir_search - Search a directory
+ * @dip: The GFS2 inode
+ * @filename:
+ * @inode:
+ *
+ * This routine searches a directory for a file or another directory.
+ * Assumes a glock is held on dip.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_search(struct inode *dir, const struct qstr *name,
+		    struct gfs2_inum *inum, unsigned int *type)
+{
+	struct buffer_head *bh;
+	struct gfs2_dirent *dent;
+
+	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
+	if (dent) {
+		if (IS_ERR(dent))
+			return PTR_ERR(dent);
+		if (inum)
+			gfs2_inum_in(inum, (char *)&dent->de_inum);
+		if (type)
+			*type = be16_to_cpu(dent->de_type);
+		brelse(bh);
+		return 0;
+	}
+	return -ENOENT;
+}
+
+static int dir_new_leaf(struct inode *inode, const struct qstr *name)
+{
+	struct buffer_head *bh, *obh;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_leaf *leaf, *oleaf;
+	int error;
+	u32 index;
+	u64 bn;
+
+	index = name->hash >> (32 - ip->i_di.di_depth);
+	error = get_first_leaf(ip, index, &obh);
+	if (error)
+		return error;
+	do {
+		oleaf = (struct gfs2_leaf *)obh->b_data;
+		bn = be64_to_cpu(oleaf->lf_next);
+		if (!bn)
+			break;
+		brelse(obh);
+		error = get_leaf(ip, bn, &obh);
+		if (error)
+			return error;
+	} while(1);
+
+	gfs2_trans_add_bh(ip->i_gl, obh, 1);
+
+	leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth));
+	if (!leaf) {
+		brelse(obh);
+		return -ENOSPC;
+	}
+	oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
+	brelse(bh);
+	brelse(obh);
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		return error;
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	ip->i_di.di_blocks++;
+	gfs2_dinode_out(&ip->i_di, bh->b_data);
+	brelse(bh);
+	return 0;
+}
+
+/**
+ * gfs2_dir_add - Add new filename into directory
+ * @dip: The GFS2 inode
+ * @filename: The new name
+ * @inode: The inode number of the entry
+ * @type: The type of the entry
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_add(struct inode *inode, const struct qstr *name,
+		 const struct gfs2_inum *inum, unsigned type)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *bh;
+	struct gfs2_dirent *dent;
+	struct gfs2_leaf *leaf;
+	int error;
+
+	while(1) {
+		dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
+					  &bh);
+		if (dent) {
+			if (IS_ERR(dent))
+				return PTR_ERR(dent);
+			dent = gfs2_init_dirent(inode, dent, name, bh);
+			gfs2_inum_out(inum, (char *)&dent->de_inum);
+			dent->de_type = cpu_to_be16(type);
+			if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+				leaf = (struct gfs2_leaf *)bh->b_data;
+				leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1);
+			}
+			brelse(bh);
+			error = gfs2_meta_inode_buffer(ip, &bh);
+			if (error)
+				break;
+			gfs2_trans_add_bh(ip->i_gl, bh, 1);
+			ip->i_di.di_entries++;
+			ip->i_di.di_mtime = ip->i_di.di_ctime = get_seconds();
+			gfs2_dinode_out(&ip->i_di, bh->b_data);
+			brelse(bh);
+			error = 0;
+			break;
+		}
+		if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+			error = dir_make_exhash(inode);
+			if (error)
+				break;
+			continue;
+		}
+		error = dir_split_leaf(inode, name);
+		if (error == 0)
+			continue;
+		if (error < 0)
+			break;
+		if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) {
+			error = dir_double_exhash(ip);
+			if (error)
+				break;
+			error = dir_split_leaf(inode, name);
+			if (error < 0)
+				break;
+			if (error == 0)
+				continue;
+		}
+		error = dir_new_leaf(inode, name);
+		if (!error)
+			continue;
+		error = -ENOSPC;
+		break;
+	}
+	return error;
+}
+
+
+/**
+ * gfs2_dir_del - Delete a directory entry
+ * @dip: The GFS2 inode
+ * @filename: The filename
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
+{
+	struct gfs2_dirent *dent, *prev = NULL;
+	struct buffer_head *bh;
+	int error;
+
+	/* Returns _either_ the entry (if its first in block) or the
+	   previous entry otherwise */
+	dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh);
+	if (!dent) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+	if (IS_ERR(dent)) {
+		gfs2_consist_inode(dip);
+		return PTR_ERR(dent);
+	}
+	/* If not first in block, adjust pointers accordingly */
+	if (gfs2_dirent_find(dent, name, NULL) == 0) {
+		prev = dent;
+		dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len));
+	}
+
+	dirent_del(dip, bh, prev, dent);
+	if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+		struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
+		u16 entries = be16_to_cpu(leaf->lf_entries);
+		if (!entries)
+			gfs2_consist_inode(dip);
+		leaf->lf_entries = cpu_to_be16(--entries);
+	}
+	brelse(bh);
+
+	error = gfs2_meta_inode_buffer(dip, &bh);
+	if (error)
+		return error;
+
+	if (!dip->i_di.di_entries)
+		gfs2_consist_inode(dip);
+	gfs2_trans_add_bh(dip->i_gl, bh, 1);
+	dip->i_di.di_entries--;
+	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+	gfs2_dinode_out(&dip->i_di, bh->b_data);
+	brelse(bh);
+	mark_inode_dirty(&dip->i_inode);
+
+	return error;
+}
+
+/**
+ * gfs2_dir_mvino - Change inode number of directory entry
+ * @dip: The GFS2 inode
+ * @filename:
+ * @new_inode:
+ *
+ * This routine changes the inode number of a directory entry.  It's used
+ * by rename to change ".." when a directory is moved.
+ * Assumes a glock is held on dvp.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+		   struct gfs2_inum *inum, unsigned int new_type)
+{
+	struct buffer_head *bh;
+	struct gfs2_dirent *dent;
+	int error;
+
+	dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh);
+	if (!dent) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+	if (IS_ERR(dent))
+		return PTR_ERR(dent);
+
+	gfs2_trans_add_bh(dip->i_gl, bh, 1);
+	gfs2_inum_out(inum, (char *)&dent->de_inum);
+	dent->de_type = cpu_to_be16(new_type);
+
+	if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+		brelse(bh);
+		error = gfs2_meta_inode_buffer(dip, &bh);
+		if (error)
+			return error;
+		gfs2_trans_add_bh(dip->i_gl, bh, 1);
+	}
+
+	dip->i_di.di_mtime = dip->i_di.di_ctime = get_seconds();
+	gfs2_dinode_out(&dip->i_di, bh->b_data);
+	brelse(bh);
+	return 0;
+}
+
+/**
+ * foreach_leaf - call a function for each leaf in a directory
+ * @dip: the directory
+ * @lc: the function to call for each each
+ * @data: private data to pass to it
+ *
+ * Returns: errno
+ */
+
+static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct buffer_head *bh;
+	struct gfs2_leaf *leaf;
+	u32 hsize, len;
+	u32 ht_offset, lp_offset, ht_offset_cur = -1;
+	u32 index = 0;
+	u64 *lp;
+	u64 leaf_no;
+	int error = 0;
+
+	hsize = 1 << dip->i_di.di_depth;
+	if (hsize * sizeof(u64) != dip->i_di.di_size) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+
+	lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL);
+	if (!lp)
+		return -ENOMEM;
+
+	while (index < hsize) {
+		lp_offset = index & (sdp->sd_hash_ptrs - 1);
+		ht_offset = index - lp_offset;
+
+		if (ht_offset_cur != ht_offset) {
+			error = gfs2_dir_read_data(dip, (char *)lp,
+						ht_offset * sizeof(u64),
+						sdp->sd_hash_bsize, 1);
+			if (error != sdp->sd_hash_bsize) {
+				if (error >= 0)
+					error = -EIO;
+				goto out;
+			}
+			ht_offset_cur = ht_offset;
+		}
+
+		leaf_no = be64_to_cpu(lp[lp_offset]);
+		if (leaf_no) {
+			error = get_leaf(dip, leaf_no, &bh);
+			if (error)
+				goto out;
+			leaf = (struct gfs2_leaf *)bh->b_data;
+			len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth));
+			brelse(bh);
+
+			error = lc(dip, index, len, leaf_no, data);
+			if (error)
+				goto out;
+
+			index = (index & ~(len - 1)) + len;
+		} else
+			index++;
+	}
+
+	if (index != hsize) {
+		gfs2_consist_inode(dip);
+		error = -EIO;
+	}
+
+out:
+	kfree(lp);
+
+	return error;
+}
+
+/**
+ * leaf_dealloc - Deallocate a directory leaf
+ * @dip: the directory
+ * @index: the hash table offset in the directory
+ * @len: the number of pointers to this leaf
+ * @leaf_no: the leaf number
+ * @data: not used
+ *
+ * Returns: errno
+ */
+
+static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
+			u64 leaf_no, void *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_leaf *tmp_leaf;
+	struct gfs2_rgrp_list rlist;
+	struct buffer_head *bh, *dibh;
+	u64 blk, nblk;
+	unsigned int rg_blocks = 0, l_blocks = 0;
+	char *ht;
+	unsigned int x, size = len * sizeof(u64);
+	int error;
+
+	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+	ht = kzalloc(size, GFP_KERNEL);
+	if (!ht)
+		return -ENOMEM;
+
+	gfs2_alloc_get(dip);
+
+	error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out;
+
+	error = gfs2_rindex_hold(sdp, &dip->i_alloc.al_ri_gh);
+	if (error)
+		goto out_qs;
+
+	/*  Count the number of leaves  */
+
+	for (blk = leaf_no; blk; blk = nblk) {
+		error = get_leaf(dip, blk, &bh);
+		if (error)
+			goto out_rlist;
+		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+		nblk = be64_to_cpu(tmp_leaf->lf_next);
+		brelse(bh);
+
+		gfs2_rlist_add(sdp, &rlist, blk);
+		l_blocks++;
+	}
+
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+	for (x = 0; x < rlist.rl_rgrps; x++) {
+		struct gfs2_rgrpd *rgd;
+		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+		rg_blocks += rgd->rd_ri.ri_length;
+	}
+
+	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+	if (error)
+		goto out_rlist;
+
+	error = gfs2_trans_begin(sdp,
+			rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) +
+			RES_DINODE + RES_STATFS + RES_QUOTA, l_blocks);
+	if (error)
+		goto out_rg_gunlock;
+
+	for (blk = leaf_no; blk; blk = nblk) {
+		error = get_leaf(dip, blk, &bh);
+		if (error)
+			goto out_end_trans;
+		tmp_leaf = (struct gfs2_leaf *)bh->b_data;
+		nblk = be64_to_cpu(tmp_leaf->lf_next);
+		brelse(bh);
+
+		gfs2_free_meta(dip, blk, 1);
+
+		if (!dip->i_di.di_blocks)
+			gfs2_consist_inode(dip);
+		dip->i_di.di_blocks--;
+	}
+
+	error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
+	if (error != size) {
+		if (error >= 0)
+			error = -EIO;
+		goto out_end_trans;
+	}
+
+	error = gfs2_meta_inode_buffer(dip, &dibh);
+	if (error)
+		goto out_end_trans;
+
+	gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+	gfs2_dinode_out(&dip->i_di, dibh->b_data);
+	brelse(dibh);
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_rg_gunlock:
+	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist:
+	gfs2_rlist_free(&rlist);
+	gfs2_glock_dq_uninit(&dip->i_alloc.al_ri_gh);
+out_qs:
+	gfs2_quota_unhold(dip);
+out:
+	gfs2_alloc_put(dip);
+	kfree(ht);
+	return error;
+}
+
+/**
+ * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
+ * @dip: the directory
+ *
+ * Dealloc all on-disk directory leaves to FREEMETA state
+ * Change on-disk inode type to "regular file"
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct buffer_head *bh;
+	int error;
+
+	/* Dealloc on-disk leaves to FREEMETA state */
+	error = foreach_leaf(dip, leaf_dealloc, NULL);
+	if (error)
+		return error;
+
+	/* Make this a regular file in case we crash.
+	   (We don't want to free these blocks a second time.)  */
+
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(dip, &bh);
+	if (!error) {
+		gfs2_trans_add_bh(dip->i_gl, bh, 1);
+		((struct gfs2_dinode *)bh->b_data)->di_mode =
+						cpu_to_be32(S_IFREG);
+		brelse(bh);
+	}
+
+	gfs2_trans_end(sdp);
+
+	return error;
+}
+
+/**
+ * gfs2_diradd_alloc_required - find if adding entry will require an allocation
+ * @ip: the file being written to
+ * @filname: the filename that's going to be added
+ *
+ * Returns: 1 if alloc required, 0 if not, -ve on error
+ */
+
+int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
+{
+	struct gfs2_dirent *dent;
+	struct buffer_head *bh;
+
+	dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
+	if (!dent) {
+		return 1;
+	}
+	if (IS_ERR(dent))
+		return PTR_ERR(dent);
+	brelse(bh);
+	return 0;
+}
+
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
new file mode 100644
index 000000000000..371233419b07
--- /dev/null
+++ b/fs/gfs2/dir.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __DIR_DOT_H__
+#define __DIR_DOT_H__
+
+#include <linux/dcache.h>
+
+struct inode;
+struct gfs2_inode;
+struct gfs2_inum;
+
+/**
+ * gfs2_filldir_t - Report a directory entry to the caller of gfs2_dir_read()
+ * @opaque: opaque data used by the function
+ * @name: the name of the directory entry
+ * @length: the length of the name
+ * @offset: the entry's offset in the directory
+ * @inum: the inode number the entry points to
+ * @type: the type of inode the entry points to
+ *
+ * Returns: 0 on success, 1 if buffer full
+ */
+
+typedef int (*gfs2_filldir_t) (void *opaque,
+			      const char *name, unsigned int length,
+			      u64 offset,
+			      struct gfs2_inum *inum, unsigned int type);
+
+int gfs2_dir_search(struct inode *dir, const struct qstr *filename,
+		    struct gfs2_inum *inum, unsigned int *type);
+int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
+		 const struct gfs2_inum *inum, unsigned int type);
+int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+int gfs2_dir_read(struct inode *inode, u64 * offset, void *opaque,
+		  gfs2_filldir_t filldir);
+int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
+		   struct gfs2_inum *new_inum, unsigned int new_type);
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
+
+int gfs2_diradd_alloc_required(struct inode *dir,
+			       const struct qstr *filename);
+int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
+			    struct buffer_head **bhp);
+
+static inline u32 gfs2_disk_hash(const char *data, int len)
+{
+        return crc32_le((u32)~0, data, len) ^ (u32)~0;
+}
+
+
+static inline void gfs2_str2qstr(struct qstr *name, const char *fname)
+{
+	name->name = fname;
+	name->len = strlen(fname);
+	name->hash = gfs2_disk_hash(name->name, name->len);
+}
+
+/* N.B. This probably ought to take inum & type as args as well */
+static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent)
+{
+	dent->de_inum.no_addr = cpu_to_be64(0);
+	dent->de_inum.no_formal_ino = cpu_to_be64(0);
+	dent->de_hash = cpu_to_be32(name->hash);
+	dent->de_rec_len = cpu_to_be16(reclen);
+	dent->de_name_len = cpu_to_be16(name->len);
+	dent->de_type = cpu_to_be16(0);
+	memset(dent->__pad, 0, sizeof(dent->__pad));
+	memcpy(dent + 1, name->name, name->len);
+}
+
+#endif /* __DIR_DOT_H__ */
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
new file mode 100644
index 000000000000..92c54e9b0dc3
--- /dev/null
+++ b/fs/gfs2/eaops.c
@@ -0,0 +1,230 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "util.h"
+
+/**
+ * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
+ * @namep: ea name, possibly with type appended
+ *
+ * Returns: GFS2_EATYPE_XXX
+ */
+
+unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
+{
+	unsigned int type;
+
+	if (strncmp(name, "system.", 7) == 0) {
+		type = GFS2_EATYPE_SYS;
+		if (truncated_name)
+			*truncated_name = name + sizeof("system.") - 1;
+	} else if (strncmp(name, "user.", 5) == 0) {
+		type = GFS2_EATYPE_USR;
+		if (truncated_name)
+			*truncated_name = name + sizeof("user.") - 1;
+	} else if (strncmp(name, "security.", 9) == 0) {
+		type = GFS2_EATYPE_SECURITY;
+		if (truncated_name)
+			*truncated_name = name + sizeof("security.") - 1;
+	} else {
+		type = GFS2_EATYPE_UNUSED;
+		if (truncated_name)
+			*truncated_name = NULL;
+	}
+
+	return type;
+}
+
+static int user_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct inode *inode = &ip->i_inode;
+	int error = permission(inode, MAY_READ, NULL);
+	if (error)
+		return error;
+
+	return gfs2_ea_get_i(ip, er);
+}
+
+static int user_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct inode *inode = &ip->i_inode;
+
+	if (S_ISREG(inode->i_mode) ||
+	    (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
+		int error = permission(inode, MAY_WRITE, NULL);
+		if (error)
+			return error;
+	} else
+		return -EPERM;
+
+	return gfs2_ea_set_i(ip, er);
+}
+
+static int user_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct inode *inode = &ip->i_inode;
+
+	if (S_ISREG(inode->i_mode) ||
+	    (S_ISDIR(inode->i_mode) && !(inode->i_mode & S_ISVTX))) {
+		int error = permission(inode, MAY_WRITE, NULL);
+		if (error)
+			return error;
+	} else
+		return -EPERM;
+
+	return gfs2_ea_remove_i(ip, er);
+}
+
+static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
+	    !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
+	    (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
+	     GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
+		return -EOPNOTSUPP;
+
+
+
+	return gfs2_ea_get_i(ip, er);
+}
+
+static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	int remove = 0;
+	int error;
+
+	if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
+		if (!(er->er_flags & GFS2_ERF_MODE)) {
+			er->er_mode = ip->i_di.di_mode;
+			er->er_flags |= GFS2_ERF_MODE;
+		}
+		error = gfs2_acl_validate_set(ip, 1, er,
+					      &remove, &er->er_mode);
+		if (error)
+			return error;
+		error = gfs2_ea_set_i(ip, er);
+		if (error)
+			return error;
+		if (remove)
+			gfs2_ea_remove_i(ip, er);
+		return 0;
+
+	} else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
+		error = gfs2_acl_validate_set(ip, 0, er,
+					      &remove, NULL);
+		if (error)
+			return error;
+		if (!remove)
+			error = gfs2_ea_set_i(ip, er);
+		else {
+			error = gfs2_ea_remove_i(ip, er);
+			if (error == -ENODATA)
+				error = 0;
+		}
+		return error;
+	}
+
+	return -EPERM;
+}
+
+static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
+		int error = gfs2_acl_validate_remove(ip, 1);
+		if (error)
+			return error;
+
+	} else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
+		int error = gfs2_acl_validate_remove(ip, 0);
+		if (error)
+			return error;
+
+	} else
+		return -EPERM;
+
+	return gfs2_ea_remove_i(ip, er);
+}
+
+static int security_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct inode *inode = &ip->i_inode;
+	int error = permission(inode, MAY_READ, NULL);
+	if (error)
+		return error;
+
+	return gfs2_ea_get_i(ip, er);
+}
+
+static int security_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct inode *inode = &ip->i_inode;
+	int error = permission(inode, MAY_WRITE, NULL);
+	if (error)
+		return error;
+
+	return gfs2_ea_set_i(ip, er);
+}
+
+static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct inode *inode = &ip->i_inode;
+	int error = permission(inode, MAY_WRITE, NULL);
+	if (error)
+		return error;
+
+	return gfs2_ea_remove_i(ip, er);
+}
+
+static struct gfs2_eattr_operations gfs2_user_eaops = {
+	.eo_get = user_eo_get,
+	.eo_set = user_eo_set,
+	.eo_remove = user_eo_remove,
+	.eo_name = "user",
+};
+
+struct gfs2_eattr_operations gfs2_system_eaops = {
+	.eo_get = system_eo_get,
+	.eo_set = system_eo_set,
+	.eo_remove = system_eo_remove,
+	.eo_name = "system",
+};
+
+static struct gfs2_eattr_operations gfs2_security_eaops = {
+	.eo_get = security_eo_get,
+	.eo_set = security_eo_set,
+	.eo_remove = security_eo_remove,
+	.eo_name = "security",
+};
+
+struct gfs2_eattr_operations *gfs2_ea_ops[] = {
+	NULL,
+	&gfs2_user_eaops,
+	&gfs2_system_eaops,
+	&gfs2_security_eaops,
+};
+
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
new file mode 100644
index 000000000000..508b4f7a2449
--- /dev/null
+++ b/fs/gfs2/eaops.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __EAOPS_DOT_H__
+#define __EAOPS_DOT_H__
+
+struct gfs2_ea_request;
+struct gfs2_inode;
+
+struct gfs2_eattr_operations {
+	int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+	int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+	int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
+	char *eo_name;
+};
+
+unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
+
+extern struct gfs2_eattr_operations gfs2_system_eaops;
+
+extern struct gfs2_eattr_operations *gfs2_ea_ops[];
+
+#endif /* __EAOPS_DOT_H__ */
+
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
new file mode 100644
index 000000000000..a65a4ccfd4dd
--- /dev/null
+++ b/fs/gfs2/eattr.c
@@ -0,0 +1,1501 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * ea_calc_size - returns the acutal number of bytes the request will take up
+ *                (not counting any unstuffed data blocks)
+ * @sdp:
+ * @er:
+ * @size:
+ *
+ * Returns: 1 if the EA should be stuffed
+ */
+
+static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er,
+			unsigned int *size)
+{
+	*size = GFS2_EAREQ_SIZE_STUFFED(er);
+	if (*size <= sdp->sd_jbsize)
+		return 1;
+
+	*size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er);
+
+	return 0;
+}
+
+static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er)
+{
+	unsigned int size;
+
+	if (er->er_data_len > GFS2_EA_MAX_DATA_LEN)
+		return -ERANGE;
+
+	ea_calc_size(sdp, er, &size);
+
+	/* This can only happen with 512 byte blocks */
+	if (size > sdp->sd_jbsize)
+		return -ERANGE;
+
+	return 0;
+}
+
+typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
+			  struct gfs2_ea_header *ea,
+			  struct gfs2_ea_header *prev, void *private);
+
+static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
+			ea_call_t ea_call, void *data)
+{
+	struct gfs2_ea_header *ea, *prev = NULL;
+	int error = 0;
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
+		return -EIO;
+
+	for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
+		if (!GFS2_EA_REC_LEN(ea))
+			goto fail;
+		if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
+						  bh->b_data + bh->b_size))
+			goto fail;
+		if (!GFS2_EATYPE_VALID(ea->ea_type))
+			goto fail;
+
+		error = ea_call(ip, bh, ea, prev, data);
+		if (error)
+			return error;
+
+		if (GFS2_EA_IS_LAST(ea)) {
+			if ((char *)GFS2_EA2NEXT(ea) !=
+			    bh->b_data + bh->b_size)
+				goto fail;
+			break;
+		}
+	}
+
+	return error;
+
+fail:
+	gfs2_consist_inode(ip);
+	return -EIO;
+}
+
+static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
+{
+	struct buffer_head *bh, *eabh;
+	u64 *eablk, *end;
+	int error;
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
+	if (error)
+		return error;
+
+	if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
+		error = ea_foreach_i(ip, bh, ea_call, data);
+		goto out;
+	}
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (u64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+		if (error)
+			break;
+		error = ea_foreach_i(ip, eabh, ea_call, data);
+		brelse(eabh);
+		if (error)
+			break;
+	}
+out:
+	brelse(bh);
+	return error;
+}
+
+struct ea_find {
+	struct gfs2_ea_request *ef_er;
+	struct gfs2_ea_location *ef_el;
+};
+
+static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct ea_find *ef = private;
+	struct gfs2_ea_request *er = ef->ef_er;
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	if (ea->ea_type == er->er_type) {
+		if (ea->ea_name_len == er->er_name_len &&
+		    !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) {
+			struct gfs2_ea_location *el = ef->ef_el;
+			get_bh(bh);
+			el->el_bh = bh;
+			el->el_ea = ea;
+			el->el_prev = prev;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+		 struct gfs2_ea_location *el)
+{
+	struct ea_find ef;
+	int error;
+
+	ef.ef_er = er;
+	ef.ef_el = el;
+
+	memset(el, 0, sizeof(struct gfs2_ea_location));
+
+	error = ea_foreach(ip, ea_find_i, &ef);
+	if (error > 0)
+		return 0;
+
+	return error;
+}
+
+/**
+ * ea_dealloc_unstuffed -
+ * @ip:
+ * @bh:
+ * @ea:
+ * @prev:
+ * @private:
+ *
+ * Take advantage of the fact that all unstuffed blocks are
+ * allocated from the same RG.  But watch, this may not always
+ * be true.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+				struct gfs2_ea_header *ea,
+				struct gfs2_ea_header *prev, void *private)
+{
+	int *leave = private;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder rg_gh;
+	struct buffer_head *dibh;
+	u64 *dataptrs, bn = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	if (GFS2_EA_IS_STUFFED(ea))
+		return 0;
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (*dataptrs) {
+			blks++;
+			bn = be64_to_cpu(*dataptrs);
+		}
+	}
+	if (!blks)
+		return 0;
+
+	rgd = gfs2_blk2rgrpd(sdp, bn);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, rgd->rd_ri.ri_length + RES_DINODE +
+				 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (!*dataptrs)
+			break;
+		bn = be64_to_cpu(*dataptrs);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, bstart, blen);
+			bstart = bn;
+			blen = 1;
+		}
+
+		*dataptrs = 0;
+		if (!ip->i_di.di_blocks)
+			gfs2_consist_inode(ip);
+		ip->i_di.di_blocks--;
+	}
+	if (bstart)
+		gfs2_free_meta(ip, bstart, blen);
+
+	if (prev && !leave) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		ea->ea_num_ptrs = 0;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_di.di_ctime = get_seconds();
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&rg_gh);
+	return error;
+}
+
+static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+			       struct gfs2_ea_header *ea,
+			       struct gfs2_ea_header *prev, int leave)
+{
+	struct gfs2_alloc *al;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out_alloc;
+
+	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+	if (error)
+		goto out_quota;
+
+	error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
+
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+
+out_quota:
+	gfs2_quota_unhold(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+struct ea_list {
+	struct gfs2_ea_request *ei_er;
+	unsigned int ei_size;
+};
+
+static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct ea_list *ei = private;
+	struct gfs2_ea_request *er = ei->ei_er;
+	unsigned int ea_size = gfs2_ea_strlen(ea);
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	if (er->er_data_len) {
+		char *prefix = NULL;
+		unsigned int l = 0;
+		char c = 0;
+
+		if (ei->ei_size + ea_size > er->er_data_len)
+			return -ERANGE;
+
+		switch (ea->ea_type) {
+		case GFS2_EATYPE_USR:
+			prefix = "user.";
+			l = 5;
+			break;
+		case GFS2_EATYPE_SYS:
+			prefix = "system.";
+			l = 7;
+			break;
+		case GFS2_EATYPE_SECURITY:
+			prefix = "security.";
+			l = 9;
+			break;
+		}
+
+		BUG_ON(l == 0);
+
+		memcpy(er->er_data + ei->ei_size, prefix, l);
+		memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
+		       ea->ea_name_len);
+		memcpy(er->er_data + ei->ei_size + ea_size - 1, &c, 1);
+	}
+
+	ei->ei_size += ea_size;
+
+	return 0;
+}
+
+/**
+ * gfs2_ea_list -
+ * @ip:
+ * @er:
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct gfs2_holder i_gh;
+	int error;
+
+	if (!er->er_data || !er->er_data_len) {
+		er->er_data = NULL;
+		er->er_data_len = 0;
+	}
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return error;
+
+	if (ip->i_di.di_eattr) {
+		struct ea_list ei = { .ei_er = er, .ei_size = 0 };
+
+		error = ea_foreach(ip, ea_list_i, &ei);
+		if (!error)
+			error = ei.ei_size;
+	}
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+/**
+ * ea_get_unstuffed - actually copies the unstuffed data into the
+ *                    request buffer
+ * @ip: The GFS2 inode
+ * @ea: The extended attribute header structure
+ * @data: The data to be copied
+ *
+ * Returns: errno
+ */
+
+static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+			    char *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head **bh;
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	unsigned int x;
+	int error = 0;
+
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bh)
+		return -ENOMEM;
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+				       bh + x);
+		if (error) {
+			while (x--)
+				brelse(bh[x]);
+			goto out;
+		}
+		dataptrs++;
+	}
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_wait(sdp, bh[x]);
+		if (error) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			goto out;
+		}
+		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			error = -EIO;
+			goto out;
+		}
+
+		memcpy(data, bh[x]->b_data + sizeof(struct gfs2_meta_header),
+		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+		amount -= sdp->sd_jbsize;
+		data += sdp->sd_jbsize;
+
+		brelse(bh[x]);
+	}
+
+out:
+	kfree(bh);
+	return error;
+}
+
+int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+		     char *data)
+{
+	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+		memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea));
+		return 0;
+	} else
+		return ea_get_unstuffed(ip, el->el_ea, data);
+}
+
+/**
+ * gfs2_ea_get_i -
+ * @ip: The GFS2 inode
+ * @er: The request structure
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_di.di_eattr)
+		return -ENODATA;
+
+	error = gfs2_ea_find(ip, er, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+
+	if (er->er_data_len) {
+		if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len)
+			error =  -ERANGE;
+		else
+			error = gfs2_ea_get_copy(ip, &el, er->er_data);
+	}
+	if (!error)
+		error = GFS2_EA_DATA_LEN(el.el_ea);
+
+	brelse(el.el_bh);
+
+	return error;
+}
+
+/**
+ * gfs2_ea_get -
+ * @ip: The GFS2 inode
+ * @er: The request structure
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct gfs2_holder i_gh;
+	int error;
+
+	if (!er->er_name_len ||
+	    er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
+	if (!er->er_data || !er->er_data_len) {
+		er->er_data = NULL;
+		er->er_data_len = 0;
+	}
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return error;
+
+	error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+/**
+ * ea_alloc_blk - allocates a new block for extended attributes.
+ * @ip: A pointer to the inode that's getting extended attributes
+ * @bhp: Pointer to pointer to a struct buffer_head
+ *
+ * Returns: errno
+ */
+
+static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_ea_header *ea;
+	u64 block;
+
+	block = gfs2_alloc_meta(ip);
+
+	*bhp = gfs2_meta_new(ip->i_gl, block);
+	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
+	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
+
+	ea = GFS2_EA_BH2FIRST(*bhp);
+	ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+	ea->ea_type = GFS2_EATYPE_UNUSED;
+	ea->ea_flags = GFS2_EAFLAG_LAST;
+	ea->ea_num_ptrs = 0;
+
+	ip->i_di.di_blocks++;
+
+	return 0;
+}
+
+/**
+ * ea_write - writes the request info to an ea, creating new blocks if
+ *            necessary
+ * @ip: inode that is being modified
+ * @ea: the location of the new ea in a block
+ * @er: the write request
+ *
+ * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bin of ea_flags
+ *
+ * returns : errno
+ */
+
+static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+		    struct gfs2_ea_request *er)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	ea->ea_data_len = cpu_to_be32(er->er_data_len);
+	ea->ea_name_len = er->er_name_len;
+	ea->ea_type = er->er_type;
+	ea->__pad = 0;
+
+	memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
+
+	if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
+		ea->ea_num_ptrs = 0;
+		memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
+	} else {
+		u64 *dataptr = GFS2_EA2DATAPTRS(ea);
+		const char *data = er->er_data;
+		unsigned int data_len = er->er_data_len;
+		unsigned int copy;
+		unsigned int x;
+
+		ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
+		for (x = 0; x < ea->ea_num_ptrs; x++) {
+			struct buffer_head *bh;
+			u64 block;
+			int mh_size = sizeof(struct gfs2_meta_header);
+
+			block = gfs2_alloc_meta(ip);
+
+			bh = gfs2_meta_new(ip->i_gl, block);
+			gfs2_trans_add_bh(ip->i_gl, bh, 1);
+			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
+
+			ip->i_di.di_blocks++;
+
+			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
+							   data_len;
+			memcpy(bh->b_data + mh_size, data, copy);
+			if (copy < sdp->sd_jbsize)
+				memset(bh->b_data + mh_size + copy, 0,
+				       sdp->sd_jbsize - copy);
+
+			*dataptr++ = cpu_to_be64(bh->b_blocknr);
+			data += copy;
+			data_len -= copy;
+
+			brelse(bh);
+		}
+
+		gfs2_assert_withdraw(sdp, !data_len);
+	}
+
+	return 0;
+}
+
+typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
+				   struct gfs2_ea_request *er, void *private);
+
+static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			     unsigned int blks,
+			     ea_skeleton_call_t skeleton_call, void *private)
+{
+	struct gfs2_alloc *al;
+	struct buffer_head *dibh;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+
+	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out;
+
+	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+	if (error)
+		goto out_gunlock_q;
+
+	al->al_requested = blks;
+
+	error = gfs2_inplace_reserve(ip);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
+				 blks + al->al_rgd->rd_ri.ri_length +
+				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+	if (error)
+		goto out_ipres;
+
+	error = skeleton_call(ip, er, private);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		if (er->er_flags & GFS2_ERF_MODE) {
+			gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+					    (ip->i_di.di_mode & S_IFMT) ==
+					    (er->er_mode & S_IFMT));
+			ip->i_di.di_mode = er->er_mode;
+		}
+		ip->i_di.di_ctime = get_seconds();
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+out_end_trans:
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+out_ipres:
+	gfs2_inplace_release(ip);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+out:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+		     void *private)
+{
+	struct buffer_head *bh;
+	int error;
+
+	error = ea_alloc_blk(ip, &bh);
+	if (error)
+		return error;
+
+	ip->i_di.di_eattr = bh->b_blocknr;
+	error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
+
+	brelse(bh);
+
+	return error;
+}
+
+/**
+ * ea_init - initializes a new eattr block
+ * @ip:
+ * @er:
+ *
+ * Returns: errno
+ */
+
+static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
+	unsigned int blks = 1;
+
+	if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize)
+		blks += DIV_ROUND_UP(er->er_data_len, jbsize);
+
+	return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL);
+}
+
+static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
+{
+	u32 ea_size = GFS2_EA_SIZE(ea);
+	struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
+				     ea_size);
+	u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
+	int last = ea->ea_flags & GFS2_EAFLAG_LAST;
+
+	ea->ea_rec_len = cpu_to_be32(ea_size);
+	ea->ea_flags ^= last;
+
+	new->ea_rec_len = cpu_to_be32(new_size);
+	new->ea_flags = last;
+
+	return new;
+}
+
+static void ea_set_remove_stuffed(struct gfs2_inode *ip,
+				  struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	u32 len;
+
+	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		return;
+	} else if (GFS2_EA2NEXT(prev) != ea) {
+		prev = GFS2_EA2NEXT(prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
+	}
+
+	len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+	prev->ea_rec_len = cpu_to_be32(len);
+
+	if (GFS2_EA_IS_LAST(ea))
+		prev->ea_flags |= GFS2_EAFLAG_LAST;
+}
+
+struct ea_set {
+	int ea_split;
+
+	struct gfs2_ea_request *es_er;
+	struct gfs2_ea_location *es_el;
+
+	struct buffer_head *es_bh;
+	struct gfs2_ea_header *es_ea;
+};
+
+static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
+				 struct gfs2_ea_header *ea, struct ea_set *es)
+{
+	struct gfs2_ea_request *er = es->es_er;
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	ea_write(ip, ea, er);
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	if (er->er_flags & GFS2_ERF_MODE) {
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+			(ip->i_di.di_mode & S_IFMT) == (er->er_mode & S_IFMT));
+		ip->i_di.di_mode = er->er_mode;
+	}
+	ip->i_di.di_ctime = get_seconds();
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	brelse(dibh);
+out:
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+	return error;
+}
+
+static int ea_set_simple_alloc(struct gfs2_inode *ip,
+			       struct gfs2_ea_request *er, void *private)
+{
+	struct ea_set *es = private;
+	struct gfs2_ea_header *ea = es->es_ea;
+	int error;
+
+	gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	error = ea_write(ip, ea, er);
+	if (error)
+		return error;
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	return 0;
+}
+
+static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
+			 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+			 void *private)
+{
+	struct ea_set *es = private;
+	unsigned int size;
+	int stuffed;
+	int error;
+
+	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size);
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
+		if (GFS2_EA_REC_LEN(ea) < size)
+			return 0;
+		if (!GFS2_EA_IS_STUFFED(ea)) {
+			error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
+			if (error)
+				return error;
+		}
+		es->ea_split = 0;
+	} else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
+		es->ea_split = 1;
+	else
+		return 0;
+
+	if (stuffed) {
+		error = ea_set_simple_noalloc(ip, bh, ea, es);
+		if (error)
+			return error;
+	} else {
+		unsigned int blks;
+
+		es->es_bh = bh;
+		es->es_ea = ea;
+		blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
+					GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+		error = ea_alloc_skeleton(ip, es->es_er, blks,
+					  ea_set_simple_alloc, es);
+		if (error)
+			return error;
+	}
+
+	return 1;
+}
+
+static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			void *private)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *indbh, *newbh;
+	u64 *eablk;
+	int error;
+	int mh_size = sizeof(struct gfs2_meta_header);
+
+	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+		u64 *end;
+
+		error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
+				       &indbh);
+		if (error)
+			return error;
+
+		if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+			error = -EIO;
+			goto out;
+		}
+
+		eablk = (u64 *)(indbh->b_data + mh_size);
+		end = eablk + sdp->sd_inptrs;
+
+		for (; eablk < end; eablk++)
+			if (!*eablk)
+				break;
+
+		if (eablk == end) {
+			error = -ENOSPC;
+			goto out;
+		}
+
+		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+	} else {
+		u64 blk;
+
+		blk = gfs2_alloc_meta(ip);
+
+		indbh = gfs2_meta_new(ip->i_gl, blk);
+		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+		gfs2_buffer_clear_tail(indbh, mh_size);
+
+		eablk = (u64 *)(indbh->b_data + mh_size);
+		*eablk = cpu_to_be64(ip->i_di.di_eattr);
+		ip->i_di.di_eattr = blk;
+		ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
+		ip->i_di.di_blocks++;
+
+		eablk++;
+	}
+
+	error = ea_alloc_blk(ip, &newbh);
+	if (error)
+		goto out;
+
+	*eablk = cpu_to_be64((u64)newbh->b_blocknr);
+	error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
+	brelse(newbh);
+	if (error)
+		goto out;
+
+	if (private)
+		ea_set_remove_stuffed(ip, private);
+
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+		    struct gfs2_ea_location *el)
+{
+	struct ea_set es;
+	unsigned int blks = 2;
+	int error;
+
+	memset(&es, 0, sizeof(struct ea_set));
+	es.es_er = er;
+	es.es_el = el;
+
+	error = ea_foreach(ip, ea_set_simple, &es);
+	if (error > 0)
+		return 0;
+	if (error)
+		return error;
+
+	if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
+		blks++;
+	if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+		blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+	return ea_alloc_skeleton(ip, er, blks, ea_set_block, el);
+}
+
+static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
+				   struct gfs2_ea_location *el)
+{
+	if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
+		el->el_prev = GFS2_EA2NEXT(el->el_prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+				     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
+	}
+
+	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0);
+}
+
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_di.di_eattr) {
+		if (er->er_flags & XATTR_REPLACE)
+			return -ENODATA;
+		return ea_init(ip, er);
+	}
+
+	error = gfs2_ea_find(ip, er, &el);
+	if (error)
+		return error;
+
+	if (el.el_ea) {
+		if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
+			brelse(el.el_bh);
+			return -EPERM;
+		}
+
+		error = -EEXIST;
+		if (!(er->er_flags & XATTR_CREATE)) {
+			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+			error = ea_set_i(ip, er, &el);
+			if (!error && unstuffed)
+				ea_set_remove_unstuffed(ip, &el);
+		}
+
+		brelse(el.el_bh);
+	} else {
+		error = -ENODATA;
+		if (!(er->er_flags & XATTR_REPLACE))
+			error = ea_set_i(ip, er, NULL);
+	}
+
+	return error;
+}
+
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct gfs2_holder i_gh;
+	int error;
+
+	if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
+	if (!er->er_data || !er->er_data_len) {
+		er->er_data = NULL;
+		er->er_data_len = 0;
+	}
+	error = ea_check_size(GFS2_SB(&ip->i_inode), er);
+	if (error)
+		return error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		return error;
+
+	if (IS_IMMUTABLE(&ip->i_inode))
+		error = -EPERM;
+	else
+		error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+
+	if (prev) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		ip->i_di.di_ctime = get_seconds();
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+	return error;
+}
+
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_di.di_eattr)
+		return -ENODATA;
+
+	error = gfs2_ea_find(ip, er, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+
+	if (GFS2_EA_IS_STUFFED(el.el_ea))
+		error = ea_remove_stuffed(ip, &el);
+	else
+		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev,
+					    0);
+
+	brelse(el.el_bh);
+
+	return error;
+}
+
+/**
+ * gfs2_ea_remove - sets (or creates or replaces) an extended attribute
+ * @ip: pointer to the inode of the target file
+ * @er: request information
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
+{
+	struct gfs2_holder i_gh;
+	int error;
+
+	if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		return error;
+
+	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+		error = -EPERM;
+	else
+		error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er);
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
+				  struct gfs2_ea_header *ea, char *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head **bh;
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	u64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	unsigned int x;
+	int error;
+
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL);
+	if (!bh)
+		return -ENOMEM;
+
+	error = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
+	if (error)
+		goto out;
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+				       bh + x);
+		if (error) {
+			while (x--)
+				brelse(bh[x]);
+			goto fail;
+		}
+		dataptrs++;
+	}
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_wait(sdp, bh[x]);
+		if (error) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			goto fail;
+		}
+		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			error = -EIO;
+			goto fail;
+		}
+
+		gfs2_trans_add_bh(ip->i_gl, bh[x], 1);
+
+		memcpy(bh[x]->b_data + sizeof(struct gfs2_meta_header), data,
+		       (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize);
+
+		amount -= sdp->sd_jbsize;
+		data += sdp->sd_jbsize;
+
+		brelse(bh[x]);
+	}
+
+out:
+	kfree(bh);
+	return error;
+
+fail:
+	gfs2_trans_end(sdp);
+	kfree(bh);
+	return error;
+}
+
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+		      struct iattr *attr, char *data)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+		error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+		if (error)
+			return error;
+
+		gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1);
+		memcpy(GFS2_EA2DATA(el->el_ea), data,
+		       GFS2_EA_DATA_LEN(el->el_ea));
+	} else
+		error = ea_acl_chmod_unstuffed(ip, el->el_ea, data);
+
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		error = inode_setattr(&ip->i_inode, attr);
+		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+		gfs2_inode_attr_out(ip);
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+	return error;
+}
+
+static int ea_dealloc_indirect(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrp_list rlist;
+	struct buffer_head *indbh, *dibh;
+	u64 *eablk, *end;
+	unsigned int rg_blocks = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
+	if (error)
+		return error;
+
+	if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + sdp->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_rlist_add(sdp, &rlist, bstart);
+			bstart = bn;
+			blen = 1;
+		}
+		blks++;
+	}
+	if (bstart)
+		gfs2_rlist_add(sdp, &rlist, bstart);
+	else
+		goto out;
+
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0);
+
+	for (x = 0; x < rlist.rl_rgrps; x++) {
+		struct gfs2_rgrpd *rgd;
+		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
+		rg_blocks += rgd->rd_ri.ri_length;
+	}
+
+	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+	if (error)
+		goto out_rlist_free;
+
+	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
+				 RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_bh(ip->i_gl, indbh, 1);
+
+	eablk = (u64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	bstart = 0;
+	blen = 0;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, bstart, blen);
+			bstart = bn;
+			blen = 1;
+		}
+
+		*eablk = 0;
+		if (!ip->i_di.di_blocks)
+			gfs2_consist_inode(ip);
+		ip->i_di.di_blocks--;
+	}
+	if (bstart)
+		gfs2_free_meta(ip, bstart, blen);
+
+	ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist_free:
+	gfs2_rlist_free(&rlist);
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_dealloc_block(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_rgrpd *rgd;
+	struct buffer_head *dibh;
+	int error;
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+				   &al->al_rgd_gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
+				 RES_QUOTA, 1);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
+
+	ip->i_di.di_eattr = 0;
+	if (!ip->i_di.di_blocks)
+		gfs2_consist_inode(ip);
+	ip->i_di.di_blocks--;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&al->al_rgd_gh);
+	return error;
+}
+
+/**
+ * gfs2_ea_dealloc - deallocate the extended attribute fork
+ * @ip: the inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip)
+{
+	struct gfs2_alloc *al;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out_alloc;
+
+	error = gfs2_rindex_hold(GFS2_SB(&ip->i_inode), &al->al_ri_gh);
+	if (error)
+		goto out_quota;
+
+	error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
+	if (error)
+		goto out_rindex;
+
+	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+		error = ea_dealloc_indirect(ip);
+		if (error)
+			goto out_rindex;
+	}
+
+	error = ea_dealloc_block(ip);
+
+out_rindex:
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_quota:
+	gfs2_quota_unhold(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/eattr.h
new file mode 100644
index 000000000000..ffa65947d686
--- /dev/null
+++ b/fs/gfs2/eattr.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __EATTR_DOT_H__
+#define __EATTR_DOT_H__
+
+struct gfs2_inode;
+struct iattr;
+
+#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
+#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
+
+#define GFS2_EA_SIZE(ea) \
+ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
+      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
+                                  (sizeof(u64) * (ea)->ea_num_ptrs)), 8)
+
+#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
+#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
+
+#define GFS2_EAREQ_SIZE_STUFFED(er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
+
+#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
+      sizeof(u64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
+
+#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
+#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
+
+#define GFS2_EA2DATAPTRS(ea) \
+((u64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+
+#define GFS2_EA2NEXT(ea) \
+((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
+
+#define GFS2_EA_BH2FIRST(bh) \
+((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
+
+#define GFS2_ERF_MODE 0x80000000
+
+struct gfs2_ea_request {
+	const char *er_name;
+	char *er_data;
+	unsigned int er_name_len;
+	unsigned int er_data_len;
+	unsigned int er_type; /* GFS2_EATYPE_... */
+	int er_flags;
+	mode_t er_mode;
+};
+
+struct gfs2_ea_location {
+	struct buffer_head *el_bh;
+	struct gfs2_ea_header *el_ea;
+	struct gfs2_ea_header *el_prev;
+};
+
+int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+
+int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip);
+
+/* Exported to acl.c */
+
+int gfs2_ea_find(struct gfs2_inode *ip,
+		 struct gfs2_ea_request *er,
+		 struct gfs2_ea_location *el);
+int gfs2_ea_get_copy(struct gfs2_inode *ip,
+		     struct gfs2_ea_location *el,
+		     char *data);
+int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+		      struct iattr *attr, char *data);
+
+static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
+{
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		return 5 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SYS:
+		return 7 + ea->ea_name_len + 1;
+	case GFS2_EATYPE_SECURITY:
+		return 9 + ea->ea_name_len + 1;
+	default:
+		return 0;
+	}
+}
+
+#endif /* __EATTR_DOT_H__ */
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
new file mode 100644
index 000000000000..3bb11c0f8b56
--- /dev/null
+++ b/fs/gfs2/gfs2.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GFS2_DOT_H__
+#define __GFS2_DOT_H__
+
+enum {
+	NO_CREATE = 0,
+	CREATE = 1,
+};
+
+enum {
+	NO_WAIT = 0,
+	WAIT = 1,
+};
+
+enum {
+	NO_FORCE = 0,
+	FORCE = 1,
+};
+
+#define GFS2_FAST_NAME_SIZE 8
+
+#endif /* __GFS2_DOT_H__ */
+
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
new file mode 100644
index 000000000000..78fe0fae23ff
--- /dev/null
+++ b/fs/gfs2/glock.c
@@ -0,0 +1,2231 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/delay.h>
+#include <linux/sort.h>
+#include <linux/jhash.h>
+#include <linux/kallsyms.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/list.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "super.h"
+#include "util.h"
+
+struct greedy {
+	struct gfs2_holder gr_gh;
+	struct work_struct gr_work;
+};
+
+struct gfs2_gl_hash_bucket {
+        struct hlist_head hb_list;
+};
+
+typedef void (*glock_examiner) (struct gfs2_glock * gl);
+
+static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
+static int dump_glock(struct gfs2_glock *gl);
+static int dump_inode(struct gfs2_inode *ip);
+
+#define GFS2_GL_HASH_SHIFT      15
+#define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
+#define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
+
+static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE];
+
+/*
+ * Despite what you might think, the numbers below are not arbitrary :-)
+ * They are taken from the ipv4 routing hash code, which is well tested
+ * and thus should be nearly optimal. Later on we might tweek the numbers
+ * but for now this should be fine.
+ *
+ * The reason for putting the locks in a separate array from the list heads
+ * is that we can have fewer locks than list heads and save memory. We use
+ * the same hash function for both, but with a different hash mask.
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
+	defined(CONFIG_PROVE_LOCKING)
+
+#ifdef CONFIG_LOCKDEP
+# define GL_HASH_LOCK_SZ        256
+#else
+# if NR_CPUS >= 32
+#  define GL_HASH_LOCK_SZ       4096
+# elif NR_CPUS >= 16
+#  define GL_HASH_LOCK_SZ       2048
+# elif NR_CPUS >= 8
+#  define GL_HASH_LOCK_SZ       1024
+# elif NR_CPUS >= 4
+#  define GL_HASH_LOCK_SZ       512
+# else
+#  define GL_HASH_LOCK_SZ       256
+# endif
+#endif
+
+/* We never want more locks than chains */
+#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
+# undef GL_HASH_LOCK_SZ
+# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
+#endif
+
+static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
+
+static inline rwlock_t *gl_lock_addr(unsigned int x)
+{
+	return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
+}
+#else /* not SMP, so no spinlocks required */
+static inline rwlock_t *gl_lock_addr(x)
+{
+	return NULL;
+}
+#endif
+
+/**
+ * relaxed_state_ok - is a requested lock compatible with the current lock mode?
+ * @actual: the current state of the lock
+ * @requested: the lock state that was requested by the caller
+ * @flags: the modifier flags passed in by the caller
+ *
+ * Returns: 1 if the locks are compatible, 0 otherwise
+ */
+
+static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
+				   int flags)
+{
+	if (actual == requested)
+		return 1;
+
+	if (flags & GL_EXACT)
+		return 0;
+
+	if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
+		return 1;
+
+	if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * gl_hash() - Turn glock number into hash bucket number
+ * @lock: The glock number
+ *
+ * Returns: The number of the corresponding hash bucket
+ */
+
+static unsigned int gl_hash(const struct gfs2_sbd *sdp,
+			    const struct lm_lockname *name)
+{
+	unsigned int h;
+
+	h = jhash(&name->ln_number, sizeof(u64), 0);
+	h = jhash(&name->ln_type, sizeof(unsigned int), h);
+	h = jhash(&sdp, sizeof(struct gfs2_sbd *), h);
+	h &= GFS2_GL_HASH_MASK;
+
+	return h;
+}
+
+/**
+ * glock_free() - Perform a few checks and then release struct gfs2_glock
+ * @gl: The glock to release
+ *
+ * Also calls lock module to release its internal structure for this glock.
+ *
+ */
+
+static void glock_free(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct inode *aspace = gl->gl_aspace;
+
+	gfs2_lm_put_lock(sdp, gl->gl_lock);
+
+	if (aspace)
+		gfs2_aspace_put(aspace);
+
+	kmem_cache_free(gfs2_glock_cachep, gl);
+}
+
+/**
+ * gfs2_glock_hold() - increment reference count on glock
+ * @gl: The glock to hold
+ *
+ */
+
+void gfs2_glock_hold(struct gfs2_glock *gl)
+{
+	atomic_inc(&gl->gl_ref);
+}
+
+/**
+ * gfs2_glock_put() - Decrement reference count on glock
+ * @gl: The glock to put
+ *
+ */
+
+int gfs2_glock_put(struct gfs2_glock *gl)
+{
+	int rv = 0;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+
+	write_lock(gl_lock_addr(gl->gl_hash));
+	if (atomic_dec_and_test(&gl->gl_ref)) {
+		hlist_del(&gl->gl_list);
+		write_unlock(gl_lock_addr(gl->gl_hash));
+		BUG_ON(spin_is_locked(&gl->gl_spin));
+		gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
+		gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
+		gfs2_assert(sdp, list_empty(&gl->gl_holders));
+		gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
+		gfs2_assert(sdp, list_empty(&gl->gl_waiters2));
+		gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
+		glock_free(gl);
+		rv = 1;
+		goto out;
+	}
+	write_unlock(gl_lock_addr(gl->gl_hash));
+out:
+	return rv;
+}
+
+/**
+ * queue_empty - check to see if a glock's queue is empty
+ * @gl: the glock
+ * @head: the head of the queue to check
+ *
+ * This function protects the list in the event that a process already
+ * has a holder on the list and is adding a second holder for itself.
+ * The glmutex lock is what generally prevents processes from working
+ * on the same glock at once, but the special case of adding a second
+ * holder for yourself ("recursive" locking) doesn't involve locking
+ * glmutex, making the spin lock necessary.
+ *
+ * Returns: 1 if the queue is empty
+ */
+
+static inline int queue_empty(struct gfs2_glock *gl, struct list_head *head)
+{
+	int empty;
+	spin_lock(&gl->gl_spin);
+	empty = list_empty(head);
+	spin_unlock(&gl->gl_spin);
+	return empty;
+}
+
+/**
+ * search_bucket() - Find struct gfs2_glock by lock number
+ * @bucket: the bucket to search
+ * @name: The lock name
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested number
+ */
+
+static struct gfs2_glock *search_bucket(unsigned int hash,
+					const struct gfs2_sbd *sdp,
+					const struct lm_lockname *name)
+{
+	struct gfs2_glock *gl;
+	struct hlist_node *h;
+
+	hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) {
+		if (!lm_name_equal(&gl->gl_name, name))
+			continue;
+		if (gl->gl_sbd != sdp)
+			continue;
+
+		atomic_inc(&gl->gl_ref);
+
+		return gl;
+	}
+
+	return NULL;
+}
+
+/**
+ * gfs2_glock_find() - Find glock by lock number
+ * @sdp: The GFS2 superblock
+ * @name: The lock name
+ *
+ * Returns: NULL, or the struct gfs2_glock with the requested number
+ */
+
+static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
+					  const struct lm_lockname *name)
+{
+	unsigned int hash = gl_hash(sdp, name);
+	struct gfs2_glock *gl;
+
+	read_lock(gl_lock_addr(hash));
+	gl = search_bucket(hash, sdp, name);
+	read_unlock(gl_lock_addr(hash));
+
+	return gl;
+}
+
+/**
+ * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
+ * @sdp: The GFS2 superblock
+ * @number: the lock number
+ * @glops: The glock_operations to use
+ * @create: If 0, don't create the glock if it doesn't exist
+ * @glp: the glock is returned here
+ *
+ * This does not lock a glock, just finds/creates structures for one.
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
+		   const struct gfs2_glock_operations *glops, int create,
+		   struct gfs2_glock **glp)
+{
+	struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
+	struct gfs2_glock *gl, *tmp;
+	unsigned int hash = gl_hash(sdp, &name);
+	int error;
+
+	read_lock(gl_lock_addr(hash));
+	gl = search_bucket(hash, sdp, &name);
+	read_unlock(gl_lock_addr(hash));
+
+	if (gl || !create) {
+		*glp = gl;
+		return 0;
+	}
+
+	gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
+	if (!gl)
+		return -ENOMEM;
+
+	gl->gl_flags = 0;
+	gl->gl_name = name;
+	atomic_set(&gl->gl_ref, 1);
+	gl->gl_state = LM_ST_UNLOCKED;
+	gl->gl_hash = hash;
+	gl->gl_owner = NULL;
+	gl->gl_ip = 0;
+	gl->gl_ops = glops;
+	gl->gl_req_gh = NULL;
+	gl->gl_req_bh = NULL;
+	gl->gl_vn = 0;
+	gl->gl_stamp = jiffies;
+	gl->gl_object = NULL;
+	gl->gl_sbd = sdp;
+	gl->gl_aspace = NULL;
+	lops_init_le(&gl->gl_le, &gfs2_glock_lops);
+
+	/* If this glock protects actual on-disk data or metadata blocks,
+	   create a VFS inode to manage the pages/buffers holding them. */
+	if (glops == &gfs2_inode_glops || glops == &gfs2_rgrp_glops) {
+		gl->gl_aspace = gfs2_aspace_get(sdp);
+		if (!gl->gl_aspace) {
+			error = -ENOMEM;
+			goto fail;
+		}
+	}
+
+	error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
+	if (error)
+		goto fail_aspace;
+
+	write_lock(gl_lock_addr(hash));
+	tmp = search_bucket(hash, sdp, &name);
+	if (tmp) {
+		write_unlock(gl_lock_addr(hash));
+		glock_free(gl);
+		gl = tmp;
+	} else {
+		hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list);
+		write_unlock(gl_lock_addr(hash));
+	}
+
+	*glp = gl;
+
+	return 0;
+
+fail_aspace:
+	if (gl->gl_aspace)
+		gfs2_aspace_put(gl->gl_aspace);
+fail:
+	kmem_cache_free(gfs2_glock_cachep, gl);
+	return error;
+}
+
+/**
+ * gfs2_holder_init - initialize a struct gfs2_holder in the default way
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+		      struct gfs2_holder *gh)
+{
+	INIT_LIST_HEAD(&gh->gh_list);
+	gh->gh_gl = gl;
+	gh->gh_ip = (unsigned long)__builtin_return_address(0);
+	gh->gh_owner = current;
+	gh->gh_state = state;
+	gh->gh_flags = flags;
+	gh->gh_error = 0;
+	gh->gh_iflags = 0;
+	init_completion(&gh->gh_wait);
+
+	if (gh->gh_state == LM_ST_EXCLUSIVE)
+		gh->gh_flags |= GL_LOCAL_EXCL;
+
+	gfs2_glock_hold(gl);
+}
+
+/**
+ * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Don't mess with the glock.
+ *
+ */
+
+void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
+{
+	gh->gh_state = state;
+	gh->gh_flags = flags;
+	if (gh->gh_state == LM_ST_EXCLUSIVE)
+		gh->gh_flags |= GL_LOCAL_EXCL;
+
+	gh->gh_iflags &= 1 << HIF_ALLOCED;
+	gh->gh_ip = (unsigned long)__builtin_return_address(0);
+}
+
+/**
+ * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference)
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_holder_uninit(struct gfs2_holder *gh)
+{
+	gfs2_glock_put(gh->gh_gl);
+	gh->gh_gl = NULL;
+	gh->gh_ip = 0;
+}
+
+/**
+ * gfs2_holder_get - get a struct gfs2_holder structure
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gfp_flags:
+ *
+ * Figure out how big an impact this function has.  Either:
+ * 1) Replace it with a cache of structures hanging off the struct gfs2_sbd
+ * 2) Leave it like it is
+ *
+ * Returns: the holder structure, NULL on ENOMEM
+ */
+
+static struct gfs2_holder *gfs2_holder_get(struct gfs2_glock *gl,
+					   unsigned int state,
+					   int flags, gfp_t gfp_flags)
+{
+	struct gfs2_holder *gh;
+
+	gh = kmalloc(sizeof(struct gfs2_holder), gfp_flags);
+	if (!gh)
+		return NULL;
+
+	gfs2_holder_init(gl, state, flags, gh);
+	set_bit(HIF_ALLOCED, &gh->gh_iflags);
+	gh->gh_ip = (unsigned long)__builtin_return_address(0);
+	return gh;
+}
+
+/**
+ * gfs2_holder_put - get rid of a struct gfs2_holder structure
+ * @gh: the holder structure
+ *
+ */
+
+static void gfs2_holder_put(struct gfs2_holder *gh)
+{
+	gfs2_holder_uninit(gh);
+	kfree(gh);
+}
+
+/**
+ * rq_mutex - process a mutex request in the queue
+ * @gh: the glock holder
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_mutex(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+
+	list_del_init(&gh->gh_list);
+	/*  gh->gh_error never examined.  */
+	set_bit(GLF_LOCK, &gl->gl_flags);
+	complete(&gh->gh_wait);
+
+	return 1;
+}
+
+/**
+ * rq_promote - process a promote request in the queue
+ * @gh: the glock holder
+ *
+ * Acquire a new inter-node lock, or change a lock state to more restrictive.
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_promote(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+	if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+		if (list_empty(&gl->gl_holders)) {
+			gl->gl_req_gh = gh;
+			set_bit(GLF_LOCK, &gl->gl_flags);
+			spin_unlock(&gl->gl_spin);
+
+			if (atomic_read(&sdp->sd_reclaim_count) >
+			    gfs2_tune_get(sdp, gt_reclaim_limit) &&
+			    !(gh->gh_flags & LM_FLAG_PRIORITY)) {
+				gfs2_reclaim_glock(sdp);
+				gfs2_reclaim_glock(sdp);
+			}
+
+			glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+			spin_lock(&gl->gl_spin);
+		}
+		return 1;
+	}
+
+	if (list_empty(&gl->gl_holders)) {
+		set_bit(HIF_FIRST, &gh->gh_iflags);
+		set_bit(GLF_LOCK, &gl->gl_flags);
+	} else {
+		struct gfs2_holder *next_gh;
+		if (gh->gh_flags & GL_LOCAL_EXCL)
+			return 1;
+		next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
+				     gh_list);
+		if (next_gh->gh_flags & GL_LOCAL_EXCL)
+			 return 1;
+	}
+
+	list_move_tail(&gh->gh_list, &gl->gl_holders);
+	gh->gh_error = 0;
+	set_bit(HIF_HOLDER, &gh->gh_iflags);
+
+	complete(&gh->gh_wait);
+
+	return 0;
+}
+
+/**
+ * rq_demote - process a demote request in the queue
+ * @gh: the glock holder
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_demote(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+	if (!list_empty(&gl->gl_holders))
+		return 1;
+
+	if (gl->gl_state == gh->gh_state || gl->gl_state == LM_ST_UNLOCKED) {
+		list_del_init(&gh->gh_list);
+		gh->gh_error = 0;
+		spin_unlock(&gl->gl_spin);
+		if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+			gfs2_holder_put(gh);
+		else
+			complete(&gh->gh_wait);
+		spin_lock(&gl->gl_spin);
+	} else {
+		gl->gl_req_gh = gh;
+		set_bit(GLF_LOCK, &gl->gl_flags);
+		spin_unlock(&gl->gl_spin);
+
+		if (gh->gh_state == LM_ST_UNLOCKED ||
+		    gl->gl_state != LM_ST_EXCLUSIVE)
+			glops->go_drop_th(gl);
+		else
+			glops->go_xmote_th(gl, gh->gh_state, gh->gh_flags);
+
+		spin_lock(&gl->gl_spin);
+	}
+
+	return 0;
+}
+
+/**
+ * rq_greedy - process a queued request to drop greedy status
+ * @gh: the glock holder
+ *
+ * Returns: 1 if the queue is blocked
+ */
+
+static int rq_greedy(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+
+	list_del_init(&gh->gh_list);
+	/*  gh->gh_error never examined.  */
+	clear_bit(GLF_GREEDY, &gl->gl_flags);
+	spin_unlock(&gl->gl_spin);
+
+	gfs2_holder_uninit(gh);
+	kfree(container_of(gh, struct greedy, gr_gh));
+
+	spin_lock(&gl->gl_spin);
+
+	return 0;
+}
+
+/**
+ * run_queue - process holder structures on a glock
+ * @gl: the glock
+ *
+ */
+static void run_queue(struct gfs2_glock *gl)
+{
+	struct gfs2_holder *gh;
+	int blocked = 1;
+
+	for (;;) {
+		if (test_bit(GLF_LOCK, &gl->gl_flags))
+			break;
+
+		if (!list_empty(&gl->gl_waiters1)) {
+			gh = list_entry(gl->gl_waiters1.next,
+					struct gfs2_holder, gh_list);
+
+			if (test_bit(HIF_MUTEX, &gh->gh_iflags))
+				blocked = rq_mutex(gh);
+			else
+				gfs2_assert_warn(gl->gl_sbd, 0);
+
+		} else if (!list_empty(&gl->gl_waiters2) &&
+			   !test_bit(GLF_SKIP_WAITERS2, &gl->gl_flags)) {
+			gh = list_entry(gl->gl_waiters2.next,
+					struct gfs2_holder, gh_list);
+
+			if (test_bit(HIF_DEMOTE, &gh->gh_iflags))
+				blocked = rq_demote(gh);
+			else if (test_bit(HIF_GREEDY, &gh->gh_iflags))
+				blocked = rq_greedy(gh);
+			else
+				gfs2_assert_warn(gl->gl_sbd, 0);
+
+		} else if (!list_empty(&gl->gl_waiters3)) {
+			gh = list_entry(gl->gl_waiters3.next,
+					struct gfs2_holder, gh_list);
+
+			if (test_bit(HIF_PROMOTE, &gh->gh_iflags))
+				blocked = rq_promote(gh);
+			else
+				gfs2_assert_warn(gl->gl_sbd, 0);
+
+		} else
+			break;
+
+		if (blocked)
+			break;
+	}
+}
+
+/**
+ * gfs2_glmutex_lock - acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Gives caller exclusive access to manipulate a glock structure.
+ */
+
+static void gfs2_glmutex_lock(struct gfs2_glock *gl)
+{
+	struct gfs2_holder gh;
+
+	gfs2_holder_init(gl, 0, 0, &gh);
+	set_bit(HIF_MUTEX, &gh.gh_iflags);
+
+	spin_lock(&gl->gl_spin);
+	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+		list_add_tail(&gh.gh_list, &gl->gl_waiters1);
+	} else {
+		gl->gl_owner = current;
+		gl->gl_ip = (unsigned long)__builtin_return_address(0);
+		complete(&gh.gh_wait);
+	}
+	spin_unlock(&gl->gl_spin);
+
+	wait_for_completion(&gh.gh_wait);
+	gfs2_holder_uninit(&gh);
+}
+
+/**
+ * gfs2_glmutex_trylock - try to acquire a local lock on a glock
+ * @gl: the glock
+ *
+ * Returns: 1 if the glock is acquired
+ */
+
+static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
+{
+	int acquired = 1;
+
+	spin_lock(&gl->gl_spin);
+	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+		acquired = 0;
+	} else {
+		gl->gl_owner = current;
+		gl->gl_ip = (unsigned long)__builtin_return_address(0);
+	}
+	spin_unlock(&gl->gl_spin);
+
+	return acquired;
+}
+
+/**
+ * gfs2_glmutex_unlock - release a local lock on a glock
+ * @gl: the glock
+ *
+ */
+
+static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
+{
+	spin_lock(&gl->gl_spin);
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	gl->gl_owner = NULL;
+	gl->gl_ip = 0;
+	run_queue(gl);
+	BUG_ON(!spin_is_locked(&gl->gl_spin));
+	spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * handle_callback - add a demote request to a lock's queue
+ * @gl: the glock
+ * @state: the state the caller wants us to change to
+ *
+ * Note: This may fail sliently if we are out of memory.
+ */
+
+static void handle_callback(struct gfs2_glock *gl, unsigned int state)
+{
+	struct gfs2_holder *gh, *new_gh = NULL;
+
+restart:
+	spin_lock(&gl->gl_spin);
+
+	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+		if (test_bit(HIF_DEMOTE, &gh->gh_iflags) &&
+		    gl->gl_req_gh != gh) {
+			if (gh->gh_state != state)
+				gh->gh_state = LM_ST_UNLOCKED;
+			goto out;
+		}
+	}
+
+	if (new_gh) {
+		list_add_tail(&new_gh->gh_list, &gl->gl_waiters2);
+		new_gh = NULL;
+	} else {
+		spin_unlock(&gl->gl_spin);
+
+		new_gh = gfs2_holder_get(gl, state, LM_FLAG_TRY, GFP_KERNEL);
+		if (!new_gh)
+			return;
+		set_bit(HIF_DEMOTE, &new_gh->gh_iflags);
+		set_bit(HIF_DEALLOC, &new_gh->gh_iflags);
+
+		goto restart;
+	}
+
+out:
+	spin_unlock(&gl->gl_spin);
+
+	if (new_gh)
+		gfs2_holder_put(new_gh);
+}
+
+void gfs2_glock_inode_squish(struct inode *inode)
+{
+	struct gfs2_holder gh;
+	struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
+	gfs2_holder_init(gl, LM_ST_UNLOCKED, 0, &gh);
+	set_bit(HIF_DEMOTE, &gh.gh_iflags);
+	spin_lock(&gl->gl_spin);
+	gfs2_assert(inode->i_sb->s_fs_info, list_empty(&gl->gl_holders));
+	list_add_tail(&gh.gh_list, &gl->gl_waiters2);
+	run_queue(gl);
+	spin_unlock(&gl->gl_spin);
+	wait_for_completion(&gh.gh_wait);
+	gfs2_holder_uninit(&gh);
+}
+
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state the new state
+ *
+ */
+
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+	int held1, held2;
+
+	held1 = (gl->gl_state != LM_ST_UNLOCKED);
+	held2 = (new_state != LM_ST_UNLOCKED);
+
+	if (held1 != held2) {
+		if (held2)
+			gfs2_glock_hold(gl);
+		else
+			gfs2_glock_put(gl);
+	}
+
+	gl->gl_state = new_state;
+}
+
+/**
+ * xmote_bh - Called after the lock module is done acquiring a lock
+ * @gl: The glock in question
+ * @ret: the int returned from the lock module
+ *
+ */
+
+static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	struct gfs2_holder *gh = gl->gl_req_gh;
+	int prev_state = gl->gl_state;
+	int op_done = 1;
+
+	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+	gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+	gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
+
+	state_change(gl, ret & LM_OUT_ST_MASK);
+
+	if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
+		if (glops->go_inval)
+			glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+	} else if (gl->gl_state == LM_ST_DEFERRED) {
+		/* We might not want to do this here.
+		   Look at moving to the inode glops. */
+		if (glops->go_inval)
+			glops->go_inval(gl, DIO_DATA);
+	}
+
+	/*  Deal with each possible exit condition  */
+
+	if (!gh)
+		gl->gl_stamp = jiffies;
+	else if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+		spin_lock(&gl->gl_spin);
+		list_del_init(&gh->gh_list);
+		gh->gh_error = -EIO;
+		spin_unlock(&gl->gl_spin);
+	} else if (test_bit(HIF_DEMOTE, &gh->gh_iflags)) {
+		spin_lock(&gl->gl_spin);
+		list_del_init(&gh->gh_list);
+		if (gl->gl_state == gh->gh_state ||
+		    gl->gl_state == LM_ST_UNLOCKED) {
+			gh->gh_error = 0;
+		} else {
+			if (gfs2_assert_warn(sdp, gh->gh_flags &
+					(LM_FLAG_TRY | LM_FLAG_TRY_1CB)) == -1)
+				fs_warn(sdp, "ret = 0x%.8X\n", ret);
+			gh->gh_error = GLR_TRYFAILED;
+		}
+		spin_unlock(&gl->gl_spin);
+
+		if (ret & LM_OUT_CANCELED)
+			handle_callback(gl, LM_ST_UNLOCKED);
+
+	} else if (ret & LM_OUT_CANCELED) {
+		spin_lock(&gl->gl_spin);
+		list_del_init(&gh->gh_list);
+		gh->gh_error = GLR_CANCELED;
+		spin_unlock(&gl->gl_spin);
+
+	} else if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
+		spin_lock(&gl->gl_spin);
+		list_move_tail(&gh->gh_list, &gl->gl_holders);
+		gh->gh_error = 0;
+		set_bit(HIF_HOLDER, &gh->gh_iflags);
+		spin_unlock(&gl->gl_spin);
+
+		set_bit(HIF_FIRST, &gh->gh_iflags);
+
+		op_done = 0;
+
+	} else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+		spin_lock(&gl->gl_spin);
+		list_del_init(&gh->gh_list);
+		gh->gh_error = GLR_TRYFAILED;
+		spin_unlock(&gl->gl_spin);
+
+	} else {
+		if (gfs2_assert_withdraw(sdp, 0) == -1)
+			fs_err(sdp, "ret = 0x%.8X\n", ret);
+	}
+
+	if (glops->go_xmote_bh)
+		glops->go_xmote_bh(gl);
+
+	if (op_done) {
+		spin_lock(&gl->gl_spin);
+		gl->gl_req_gh = NULL;
+		gl->gl_req_bh = NULL;
+		clear_bit(GLF_LOCK, &gl->gl_flags);
+		run_queue(gl);
+		spin_unlock(&gl->gl_spin);
+	}
+
+	gfs2_glock_put(gl);
+
+	if (gh) {
+		if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+			gfs2_holder_put(gh);
+		else
+			complete(&gh->gh_wait);
+	}
+}
+
+/**
+ * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
+ * @gl: The glock in question
+ * @state: the requested state
+ * @flags: modifier flags to the lock call
+ *
+ */
+
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
+				 LM_FLAG_NOEXP | LM_FLAG_ANY |
+				 LM_FLAG_PRIORITY);
+	unsigned int lck_ret;
+
+	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+	gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+	gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
+	gfs2_assert_warn(sdp, state != gl->gl_state);
+
+	if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
+		glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+
+	gfs2_glock_hold(gl);
+	gl->gl_req_bh = xmote_bh;
+
+	lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
+
+	if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
+		return;
+
+	if (lck_ret & LM_OUT_ASYNC)
+		gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
+	else
+		xmote_bh(gl, lck_ret);
+}
+
+/**
+ * drop_bh - Called after a lock module unlock completes
+ * @gl: the glock
+ * @ret: the return status
+ *
+ * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
+ * Doesn't drop the reference on the glock the top half took out
+ *
+ */
+
+static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	struct gfs2_holder *gh = gl->gl_req_gh;
+
+	clear_bit(GLF_PREFETCH, &gl->gl_flags);
+
+	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+	gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+	gfs2_assert_warn(sdp, !ret);
+
+	state_change(gl, LM_ST_UNLOCKED);
+
+	if (glops->go_inval)
+		glops->go_inval(gl, DIO_METADATA | DIO_DATA);
+
+	if (gh) {
+		spin_lock(&gl->gl_spin);
+		list_del_init(&gh->gh_list);
+		gh->gh_error = 0;
+		spin_unlock(&gl->gl_spin);
+	}
+
+	if (glops->go_drop_bh)
+		glops->go_drop_bh(gl);
+
+	spin_lock(&gl->gl_spin);
+	gl->gl_req_gh = NULL;
+	gl->gl_req_bh = NULL;
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	run_queue(gl);
+	spin_unlock(&gl->gl_spin);
+
+	gfs2_glock_put(gl);
+
+	if (gh) {
+		if (test_bit(HIF_DEALLOC, &gh->gh_iflags))
+			gfs2_holder_put(gh);
+		else
+			complete(&gh->gh_wait);
+	}
+}
+
+/**
+ * gfs2_glock_drop_th - call into the lock module to unlock a lock
+ * @gl: the glock
+ *
+ */
+
+void gfs2_glock_drop_th(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	unsigned int ret;
+
+	gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+	gfs2_assert_warn(sdp, queue_empty(gl, &gl->gl_holders));
+	gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
+
+	if (gl->gl_state == LM_ST_EXCLUSIVE && glops->go_sync)
+		glops->go_sync(gl, DIO_METADATA | DIO_DATA | DIO_RELEASE);
+
+	gfs2_glock_hold(gl);
+	gl->gl_req_bh = drop_bh;
+
+	ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
+
+	if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
+		return;
+
+	if (!ret)
+		drop_bh(gl, ret);
+	else
+		gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
+}
+
+/**
+ * do_cancels - cancel requests for locks stuck waiting on an expire flag
+ * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
+ *
+ * Don't cancel GL_NOCANCEL requests.
+ */
+
+static void do_cancels(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+
+	spin_lock(&gl->gl_spin);
+
+	while (gl->gl_req_gh != gh &&
+	       !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+	       !list_empty(&gh->gh_list)) {
+		if (gl->gl_req_bh && !(gl->gl_req_gh &&
+				     (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
+			spin_unlock(&gl->gl_spin);
+			gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock);
+			msleep(100);
+			spin_lock(&gl->gl_spin);
+		} else {
+			spin_unlock(&gl->gl_spin);
+			msleep(100);
+			spin_lock(&gl->gl_spin);
+		}
+	}
+
+	spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * glock_wait_internal - wait on a glock acquisition
+ * @gh: the glock holder
+ *
+ * Returns: 0 on success
+ */
+
+static int glock_wait_internal(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+	if (test_bit(HIF_ABORTED, &gh->gh_iflags))
+		return -EIO;
+
+	if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+		spin_lock(&gl->gl_spin);
+		if (gl->gl_req_gh != gh &&
+		    !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+		    !list_empty(&gh->gh_list)) {
+			list_del_init(&gh->gh_list);
+			gh->gh_error = GLR_TRYFAILED;
+			run_queue(gl);
+			spin_unlock(&gl->gl_spin);
+			return gh->gh_error;
+		}
+		spin_unlock(&gl->gl_spin);
+	}
+
+	if (gh->gh_flags & LM_FLAG_PRIORITY)
+		do_cancels(gh);
+
+	wait_for_completion(&gh->gh_wait);
+
+	if (gh->gh_error)
+		return gh->gh_error;
+
+	gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
+	gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
+						   gh->gh_flags));
+
+	if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
+		gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
+
+		if (glops->go_lock) {
+			gh->gh_error = glops->go_lock(gh);
+			if (gh->gh_error) {
+				spin_lock(&gl->gl_spin);
+				list_del_init(&gh->gh_list);
+				spin_unlock(&gl->gl_spin);
+			}
+		}
+
+		spin_lock(&gl->gl_spin);
+		gl->gl_req_gh = NULL;
+		gl->gl_req_bh = NULL;
+		clear_bit(GLF_LOCK, &gl->gl_flags);
+		run_queue(gl);
+		spin_unlock(&gl->gl_spin);
+	}
+
+	return gh->gh_error;
+}
+
+static inline struct gfs2_holder *
+find_holder_by_owner(struct list_head *head, struct task_struct *owner)
+{
+	struct gfs2_holder *gh;
+
+	list_for_each_entry(gh, head, gh_list) {
+		if (gh->gh_owner == owner)
+			return gh;
+	}
+
+	return NULL;
+}
+
+/**
+ * add_to_queue - Add a holder to the wait queue (but look for recursion)
+ * @gh: the holder structure to add
+ *
+ */
+
+static void add_to_queue(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_holder *existing;
+
+	BUG_ON(!gh->gh_owner);
+
+	existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner);
+	if (existing) {
+		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
+		printk(KERN_INFO "pid : %d\n", existing->gh_owner->pid);
+		printk(KERN_INFO "lock type : %d lock state : %d\n",
+				existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
+		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+		printk(KERN_INFO "pid : %d\n", gh->gh_owner->pid);
+		printk(KERN_INFO "lock type : %d lock state : %d\n",
+				gl->gl_name.ln_type, gl->gl_state);
+		BUG();
+	}
+
+	existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner);
+	if (existing) {
+		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
+		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+		BUG();
+	}
+
+	if (gh->gh_flags & LM_FLAG_PRIORITY)
+		list_add(&gh->gh_list, &gl->gl_waiters3);
+	else
+		list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+}
+
+/**
+ * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock)
+ * @gh: the holder structure
+ *
+ * if (gh->gh_flags & GL_ASYNC), this never returns an error
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+
+int gfs2_glock_nq(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	int error = 0;
+
+restart:
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
+		set_bit(HIF_ABORTED, &gh->gh_iflags);
+		return -EIO;
+	}
+
+	set_bit(HIF_PROMOTE, &gh->gh_iflags);
+
+	spin_lock(&gl->gl_spin);
+	add_to_queue(gh);
+	run_queue(gl);
+	spin_unlock(&gl->gl_spin);
+
+	if (!(gh->gh_flags & GL_ASYNC)) {
+		error = glock_wait_internal(gh);
+		if (error == GLR_CANCELED) {
+			msleep(100);
+			goto restart;
+		}
+	}
+
+	clear_bit(GLF_PREFETCH, &gl->gl_flags);
+
+	if (error == GLR_TRYFAILED && (gh->gh_flags & GL_DUMP))
+		dump_glock(gl);
+
+	return error;
+}
+
+/**
+ * gfs2_glock_poll - poll to see if an async request has been completed
+ * @gh: the holder
+ *
+ * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on
+ */
+
+int gfs2_glock_poll(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	int ready = 0;
+
+	spin_lock(&gl->gl_spin);
+
+	if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+		ready = 1;
+	else if (list_empty(&gh->gh_list)) {
+		if (gh->gh_error == GLR_CANCELED) {
+			spin_unlock(&gl->gl_spin);
+			msleep(100);
+			if (gfs2_glock_nq(gh))
+				return 1;
+			return 0;
+		} else
+			ready = 1;
+	}
+
+	spin_unlock(&gl->gl_spin);
+
+	return ready;
+}
+
+/**
+ * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
+ * @gh: the holder structure
+ *
+ * Returns: 0, GLR_TRYFAILED, or errno on failure
+ */
+
+int gfs2_glock_wait(struct gfs2_holder *gh)
+{
+	int error;
+
+	error = glock_wait_internal(gh);
+	if (error == GLR_CANCELED) {
+		msleep(100);
+		gh->gh_flags &= ~GL_ASYNC;
+		error = gfs2_glock_nq(gh);
+	}
+
+	return error;
+}
+
+/**
+ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+ * @gh: the glock holder
+ *
+ */
+
+void gfs2_glock_dq(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+	if (gh->gh_flags & GL_NOCACHE)
+		handle_callback(gl, LM_ST_UNLOCKED);
+
+	gfs2_glmutex_lock(gl);
+
+	spin_lock(&gl->gl_spin);
+	list_del_init(&gh->gh_list);
+
+	if (list_empty(&gl->gl_holders)) {
+		spin_unlock(&gl->gl_spin);
+
+		if (glops->go_unlock)
+			glops->go_unlock(gh);
+
+		gl->gl_stamp = jiffies;
+
+		spin_lock(&gl->gl_spin);
+	}
+
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	run_queue(gl);
+	spin_unlock(&gl->gl_spin);
+}
+
+/**
+ * gfs2_glock_prefetch - Try to prefetch a glock
+ * @gl: the glock
+ * @state: the state to prefetch in
+ * @flags: flags passed to go_xmote_th()
+ *
+ */
+
+static void gfs2_glock_prefetch(struct gfs2_glock *gl, unsigned int state,
+				int flags)
+{
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+	spin_lock(&gl->gl_spin);
+
+	if (test_bit(GLF_LOCK, &gl->gl_flags) || !list_empty(&gl->gl_holders) ||
+	    !list_empty(&gl->gl_waiters1) || !list_empty(&gl->gl_waiters2) ||
+	    !list_empty(&gl->gl_waiters3) ||
+	    relaxed_state_ok(gl->gl_state, state, flags)) {
+		spin_unlock(&gl->gl_spin);
+		return;
+	}
+
+	set_bit(GLF_PREFETCH, &gl->gl_flags);
+	set_bit(GLF_LOCK, &gl->gl_flags);
+	spin_unlock(&gl->gl_spin);
+
+	glops->go_xmote_th(gl, state, flags);
+}
+
+static void greedy_work(void *data)
+{
+	struct greedy *gr = data;
+	struct gfs2_holder *gh = &gr->gr_gh;
+	struct gfs2_glock *gl = gh->gh_gl;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+
+	clear_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+
+	if (glops->go_greedy)
+		glops->go_greedy(gl);
+
+	spin_lock(&gl->gl_spin);
+
+	if (list_empty(&gl->gl_waiters2)) {
+		clear_bit(GLF_GREEDY, &gl->gl_flags);
+		spin_unlock(&gl->gl_spin);
+		gfs2_holder_uninit(gh);
+		kfree(gr);
+	} else {
+		gfs2_glock_hold(gl);
+		list_add_tail(&gh->gh_list, &gl->gl_waiters2);
+		run_queue(gl);
+		spin_unlock(&gl->gl_spin);
+		gfs2_glock_put(gl);
+	}
+}
+
+/**
+ * gfs2_glock_be_greedy -
+ * @gl:
+ * @time:
+ *
+ * Returns: 0 if go_greedy will be called, 1 otherwise
+ */
+
+int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time)
+{
+	struct greedy *gr;
+	struct gfs2_holder *gh;
+
+	if (!time || gl->gl_sbd->sd_args.ar_localcaching ||
+	    test_and_set_bit(GLF_GREEDY, &gl->gl_flags))
+		return 1;
+
+	gr = kmalloc(sizeof(struct greedy), GFP_KERNEL);
+	if (!gr) {
+		clear_bit(GLF_GREEDY, &gl->gl_flags);
+		return 1;
+	}
+	gh = &gr->gr_gh;
+
+	gfs2_holder_init(gl, 0, 0, gh);
+	set_bit(HIF_GREEDY, &gh->gh_iflags);
+	INIT_WORK(&gr->gr_work, greedy_work, gr);
+
+	set_bit(GLF_SKIP_WAITERS2, &gl->gl_flags);
+	schedule_delayed_work(&gr->gr_work, time);
+
+	return 0;
+}
+
+/**
+ * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it
+ * @gh: the holder structure
+ *
+ */
+
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
+{
+	gfs2_glock_dq(gh);
+	gfs2_holder_uninit(gh);
+}
+
+/**
+ * gfs2_glock_nq_num - acquire a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the aquisition
+ * @gh: the struct gfs2_holder
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
+		      const struct gfs2_glock_operations *glops,
+		      unsigned int state, int flags, struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl;
+	int error;
+
+	error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
+	if (!error) {
+		error = gfs2_glock_nq_init(gl, state, flags, gh);
+		gfs2_glock_put(gl);
+	}
+
+	return error;
+}
+
+/**
+ * glock_compare - Compare two struct gfs2_glock structures for sorting
+ * @arg_a: the first structure
+ * @arg_b: the second structure
+ *
+ */
+
+static int glock_compare(const void *arg_a, const void *arg_b)
+{
+	const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
+	const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
+	const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
+	const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
+
+	if (a->ln_number > b->ln_number)
+		return 1;
+	if (a->ln_number < b->ln_number)
+		return -1;
+	if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
+		return 1;
+	if (!(gh_a->gh_flags & GL_LOCAL_EXCL) && (gh_b->gh_flags & GL_LOCAL_EXCL))
+		return 1;
+	return 0;
+}
+
+/**
+ * nq_m_sync - synchonously acquire more than one glock in deadlock free order
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+
+static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs,
+		     struct gfs2_holder **p)
+{
+	unsigned int x;
+	int error = 0;
+
+	for (x = 0; x < num_gh; x++)
+		p[x] = &ghs[x];
+
+	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL);
+
+	for (x = 0; x < num_gh; x++) {
+		p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+
+		error = gfs2_glock_nq(p[x]);
+		if (error) {
+			while (x--)
+				gfs2_glock_dq(p[x]);
+			break;
+		}
+	}
+
+	return error;
+}
+
+/**
+ * gfs2_glock_nq_m - acquire multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Figure out how big an impact this function has.  Either:
+ * 1) Replace this code with code that calls gfs2_glock_prefetch()
+ * 2) Forget async stuff and just call nq_m_sync()
+ * 3) Leave it like it is
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	int *e;
+	unsigned int x;
+	int borked = 0, serious = 0;
+	int error = 0;
+
+	if (!num_gh)
+		return 0;
+
+	if (num_gh == 1) {
+		ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+		return gfs2_glock_nq(ghs);
+	}
+
+	e = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+
+	for (x = 0; x < num_gh; x++) {
+		ghs[x].gh_flags |= LM_FLAG_TRY | GL_ASYNC;
+		error = gfs2_glock_nq(&ghs[x]);
+		if (error) {
+			borked = 1;
+			serious = error;
+			num_gh = x;
+			break;
+		}
+	}
+
+	for (x = 0; x < num_gh; x++) {
+		error = e[x] = glock_wait_internal(&ghs[x]);
+		if (error) {
+			borked = 1;
+			if (error != GLR_TRYFAILED && error != GLR_CANCELED)
+				serious = error;
+		}
+	}
+
+	if (!borked) {
+		kfree(e);
+		return 0;
+	}
+
+	for (x = 0; x < num_gh; x++)
+		if (!e[x])
+			gfs2_glock_dq(&ghs[x]);
+
+	if (serious)
+		error = serious;
+	else {
+		for (x = 0; x < num_gh; x++)
+			gfs2_holder_reinit(ghs[x].gh_state, ghs[x].gh_flags,
+					  &ghs[x]);
+		error = nq_m_sync(num_gh, ghs, (struct gfs2_holder **)e);
+	}
+
+	kfree(e);
+
+	return error;
+}
+
+/**
+ * gfs2_glock_dq_m - release multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ */
+
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	unsigned int x;
+
+	for (x = 0; x < num_gh; x++)
+		gfs2_glock_dq(&ghs[x]);
+}
+
+/**
+ * gfs2_glock_dq_uninit_m - release multiple glocks
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ */
+
+void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	unsigned int x;
+
+	for (x = 0; x < num_gh; x++)
+		gfs2_glock_dq_uninit(&ghs[x]);
+}
+
+/**
+ * gfs2_glock_prefetch_num - prefetch a glock based on lock number
+ * @sdp: the filesystem
+ * @number: the lock number
+ * @glops: the glock operations for the type of glock
+ * @state: the state to acquire the glock in
+ * @flags: modifier flags for the aquisition
+ *
+ * Returns: errno
+ */
+
+void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
+			     const struct gfs2_glock_operations *glops,
+			     unsigned int state, int flags)
+{
+	struct gfs2_glock *gl;
+	int error;
+
+	if (atomic_read(&sdp->sd_reclaim_count) <
+	    gfs2_tune_get(sdp, gt_reclaim_limit)) {
+		error = gfs2_glock_get(sdp, number, glops, CREATE, &gl);
+		if (!error) {
+			gfs2_glock_prefetch(gl, state, flags);
+			gfs2_glock_put(gl);
+		}
+	}
+}
+
+/**
+ * gfs2_lvb_hold - attach a LVB from a glock
+ * @gl: The glock in question
+ *
+ */
+
+int gfs2_lvb_hold(struct gfs2_glock *gl)
+{
+	int error;
+
+	gfs2_glmutex_lock(gl);
+
+	if (!atomic_read(&gl->gl_lvb_count)) {
+		error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
+		if (error) {
+			gfs2_glmutex_unlock(gl);
+			return error;
+		}
+		gfs2_glock_hold(gl);
+	}
+	atomic_inc(&gl->gl_lvb_count);
+
+	gfs2_glmutex_unlock(gl);
+
+	return 0;
+}
+
+/**
+ * gfs2_lvb_unhold - detach a LVB from a glock
+ * @gl: The glock in question
+ *
+ */
+
+void gfs2_lvb_unhold(struct gfs2_glock *gl)
+{
+	gfs2_glock_hold(gl);
+	gfs2_glmutex_lock(gl);
+
+	gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
+	if (atomic_dec_and_test(&gl->gl_lvb_count)) {
+		gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb);
+		gl->gl_lvb = NULL;
+		gfs2_glock_put(gl);
+	}
+
+	gfs2_glmutex_unlock(gl);
+	gfs2_glock_put(gl);
+}
+
+static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
+			unsigned int state)
+{
+	struct gfs2_glock *gl;
+
+	gl = gfs2_glock_find(sdp, name);
+	if (!gl)
+		return;
+
+	if (gl->gl_ops->go_callback)
+		gl->gl_ops->go_callback(gl, state);
+	handle_callback(gl, state);
+
+	spin_lock(&gl->gl_spin);
+	run_queue(gl);
+	spin_unlock(&gl->gl_spin);
+
+	gfs2_glock_put(gl);
+}
+
+/**
+ * gfs2_glock_cb - Callback used by locking module
+ * @sdp: Pointer to the superblock
+ * @type: Type of callback
+ * @data: Type dependent data pointer
+ *
+ * Called by the locking module when it wants to tell us something.
+ * Either we need to drop a lock, one of our ASYNC requests completed, or
+ * a journal from another client needs to be recovered.
+ */
+
+void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
+{
+	struct gfs2_sbd *sdp = cb_data;
+
+	switch (type) {
+	case LM_CB_NEED_E:
+		blocking_cb(sdp, data, LM_ST_UNLOCKED);
+		return;
+
+	case LM_CB_NEED_D:
+		blocking_cb(sdp, data, LM_ST_DEFERRED);
+		return;
+
+	case LM_CB_NEED_S:
+		blocking_cb(sdp, data, LM_ST_SHARED);
+		return;
+
+	case LM_CB_ASYNC: {
+		struct lm_async_cb *async = data;
+		struct gfs2_glock *gl;
+
+		gl = gfs2_glock_find(sdp, &async->lc_name);
+		if (gfs2_assert_warn(sdp, gl))
+			return;
+		if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
+			gl->gl_req_bh(gl, async->lc_ret);
+		gfs2_glock_put(gl);
+		return;
+	}
+
+	case LM_CB_NEED_RECOVERY:
+		gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
+		if (sdp->sd_recoverd_process)
+			wake_up_process(sdp->sd_recoverd_process);
+		return;
+
+	case LM_CB_DROPLOCKS:
+		gfs2_gl_hash_clear(sdp, NO_WAIT);
+		gfs2_quota_scan(sdp);
+		return;
+
+	default:
+		gfs2_assert_warn(sdp, 0);
+		return;
+	}
+}
+
+/**
+ * demote_ok - Check to see if it's ok to unlock a glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int demote_ok(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	int demote = 1;
+
+	if (test_bit(GLF_STICKY, &gl->gl_flags))
+		demote = 0;
+	else if (test_bit(GLF_PREFETCH, &gl->gl_flags))
+		demote = time_after_eq(jiffies, gl->gl_stamp +
+				    gfs2_tune_get(sdp, gt_prefetch_secs) * HZ);
+	else if (glops->go_demote_ok)
+		demote = glops->go_demote_ok(gl);
+
+	return demote;
+}
+
+/**
+ * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * @gl: the glock
+ *
+ */
+
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+
+	spin_lock(&sdp->sd_reclaim_lock);
+	if (list_empty(&gl->gl_reclaim)) {
+		gfs2_glock_hold(gl);
+		list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
+		atomic_inc(&sdp->sd_reclaim_count);
+	}
+	spin_unlock(&sdp->sd_reclaim_lock);
+
+	wake_up(&sdp->sd_reclaim_wq);
+}
+
+/**
+ * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
+ * @sdp: the filesystem
+ *
+ * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
+ * different glock and we notice that there are a lot of glocks in the
+ * reclaim list.
+ *
+ */
+
+void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
+{
+	struct gfs2_glock *gl;
+
+	spin_lock(&sdp->sd_reclaim_lock);
+	if (list_empty(&sdp->sd_reclaim_list)) {
+		spin_unlock(&sdp->sd_reclaim_lock);
+		return;
+	}
+	gl = list_entry(sdp->sd_reclaim_list.next,
+			struct gfs2_glock, gl_reclaim);
+	list_del_init(&gl->gl_reclaim);
+	spin_unlock(&sdp->sd_reclaim_lock);
+
+	atomic_dec(&sdp->sd_reclaim_count);
+	atomic_inc(&sdp->sd_reclaimed);
+
+	if (gfs2_glmutex_trylock(gl)) {
+		if (queue_empty(gl, &gl->gl_holders) &&
+		    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+			handle_callback(gl, LM_ST_UNLOCKED);
+		gfs2_glmutex_unlock(gl);
+	}
+
+	gfs2_glock_put(gl);
+}
+
+/**
+ * examine_bucket - Call a function for glock in a hash bucket
+ * @examiner: the function
+ * @sdp: the filesystem
+ * @bucket: the bucket
+ *
+ * Returns: 1 if the bucket has entries
+ */
+
+static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
+			  unsigned int hash)
+{
+	struct gfs2_glock *gl, *prev = NULL;
+	int has_entries = 0;
+	struct hlist_head *head = &gl_hash_table[hash].hb_list;
+
+	read_lock(gl_lock_addr(hash));
+	/* Can't use hlist_for_each_entry - don't want prefetch here */
+	if (hlist_empty(head))
+		goto out;
+	gl = list_entry(head->first, struct gfs2_glock, gl_list);
+	while(1) {
+		if (gl->gl_sbd == sdp) {
+			gfs2_glock_hold(gl);
+			read_unlock(gl_lock_addr(hash));
+			if (prev)
+				gfs2_glock_put(prev);
+			prev = gl;
+			examiner(gl);
+			has_entries = 1;
+			read_lock(gl_lock_addr(hash));
+		}
+		if (gl->gl_list.next == NULL)
+			break;
+		gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
+	}
+out:
+	read_unlock(gl_lock_addr(hash));
+	if (prev)
+		gfs2_glock_put(prev);
+	return has_entries;
+}
+
+/**
+ * scan_glock - look at a glock and see if we can reclaim it
+ * @gl: the glock to look at
+ *
+ */
+
+static void scan_glock(struct gfs2_glock *gl)
+{
+	if (gl->gl_ops == &gfs2_inode_glops)
+		return;
+
+	if (gfs2_glmutex_trylock(gl)) {
+		if (queue_empty(gl, &gl->gl_holders) &&
+		    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+			goto out_schedule;
+		gfs2_glmutex_unlock(gl);
+	}
+	return;
+
+out_schedule:
+	gfs2_glmutex_unlock(gl);
+	gfs2_glock_schedule_for_reclaim(gl);
+}
+
+/**
+ * gfs2_scand_internal - Look for glocks and inodes to toss from memory
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_scand_internal(struct gfs2_sbd *sdp)
+{
+	unsigned int x;
+
+	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+		examine_bucket(scan_glock, sdp, x);
+}
+
+/**
+ * clear_glock - look at a glock and see if we can free it from glock cache
+ * @gl: the glock to look at
+ *
+ */
+
+static void clear_glock(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	int released;
+
+	spin_lock(&sdp->sd_reclaim_lock);
+	if (!list_empty(&gl->gl_reclaim)) {
+		list_del_init(&gl->gl_reclaim);
+		atomic_dec(&sdp->sd_reclaim_count);
+		spin_unlock(&sdp->sd_reclaim_lock);
+		released = gfs2_glock_put(gl);
+		gfs2_assert(sdp, !released);
+	} else {
+		spin_unlock(&sdp->sd_reclaim_lock);
+	}
+
+	if (gfs2_glmutex_trylock(gl)) {
+		if (queue_empty(gl, &gl->gl_holders) &&
+		    gl->gl_state != LM_ST_UNLOCKED)
+			handle_callback(gl, LM_ST_UNLOCKED);
+		gfs2_glmutex_unlock(gl);
+	}
+}
+
+/**
+ * gfs2_gl_hash_clear - Empty out the glock hash table
+ * @sdp: the filesystem
+ * @wait: wait until it's all gone
+ *
+ * Called when unmounting the filesystem, or when inter-node lock manager
+ * requests DROPLOCKS because it is running out of capacity.
+ */
+
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+{
+	unsigned long t;
+	unsigned int x;
+	int cont;
+
+	t = jiffies;
+
+	for (;;) {
+		cont = 0;
+		for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
+			if (examine_bucket(clear_glock, sdp, x))
+				cont = 1;
+		}
+
+		if (!wait || !cont)
+			break;
+
+		if (time_after_eq(jiffies,
+				  t + gfs2_tune_get(sdp, gt_stall_secs) * HZ)) {
+			fs_warn(sdp, "Unmount seems to be stalled. "
+				     "Dumping lock state...\n");
+			gfs2_dump_lockstate(sdp);
+			t = jiffies;
+		}
+
+		invalidate_inodes(sdp->sd_vfs);
+		msleep(10);
+	}
+}
+
+/*
+ *  Diagnostic routines to help debug distributed deadlock
+ */
+
+/**
+ * dump_holder - print information about a glock holder
+ * @str: a string naming the type of holder
+ * @gh: the glock holder
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+
+static int dump_holder(char *str, struct gfs2_holder *gh)
+{
+	unsigned int x;
+	int error = -ENOBUFS;
+
+	printk(KERN_INFO "  %s\n", str);
+	printk(KERN_INFO "    owner = %ld\n",
+		   (gh->gh_owner) ? (long)gh->gh_owner->pid : -1);
+	printk(KERN_INFO "    gh_state = %u\n", gh->gh_state);
+	printk(KERN_INFO "    gh_flags =");
+	for (x = 0; x < 32; x++)
+		if (gh->gh_flags & (1 << x))
+			printk(" %u", x);
+	printk(" \n");
+	printk(KERN_INFO "    error = %d\n", gh->gh_error);
+	printk(KERN_INFO "    gh_iflags =");
+	for (x = 0; x < 32; x++)
+		if (test_bit(x, &gh->gh_iflags))
+			printk(" %u", x);
+	printk(" \n");
+	print_symbol(KERN_INFO "    initialized at: %s\n", gh->gh_ip);
+
+	error = 0;
+
+	return error;
+}
+
+/**
+ * dump_inode - print information about an inode
+ * @ip: the inode
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+
+static int dump_inode(struct gfs2_inode *ip)
+{
+	unsigned int x;
+	int error = -ENOBUFS;
+
+	printk(KERN_INFO "  Inode:\n");
+	printk(KERN_INFO "    num = %llu %llu\n",
+		    (unsigned long long)ip->i_num.no_formal_ino,
+		    (unsigned long long)ip->i_num.no_addr);
+	printk(KERN_INFO "    type = %u\n", IF2DT(ip->i_di.di_mode));
+	printk(KERN_INFO "    i_flags =");
+	for (x = 0; x < 32; x++)
+		if (test_bit(x, &ip->i_flags))
+			printk(" %u", x);
+	printk(" \n");
+
+	error = 0;
+
+	return error;
+}
+
+/**
+ * dump_glock - print information about a glock
+ * @gl: the glock
+ * @count: where we are in the buffer
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+
+static int dump_glock(struct gfs2_glock *gl)
+{
+	struct gfs2_holder *gh;
+	unsigned int x;
+	int error = -ENOBUFS;
+
+	spin_lock(&gl->gl_spin);
+
+	printk(KERN_INFO "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
+	       (unsigned long long)gl->gl_name.ln_number);
+	printk(KERN_INFO "  gl_flags =");
+	for (x = 0; x < 32; x++) {
+		if (test_bit(x, &gl->gl_flags))
+			printk(" %u", x);
+	}
+	printk(" \n");
+	printk(KERN_INFO "  gl_ref = %d\n", atomic_read(&gl->gl_ref));
+	printk(KERN_INFO "  gl_state = %u\n", gl->gl_state);
+	printk(KERN_INFO "  gl_owner = %s\n", gl->gl_owner->comm);
+	print_symbol(KERN_INFO "  gl_ip = %s\n", gl->gl_ip);
+	printk(KERN_INFO "  req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
+	printk(KERN_INFO "  req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
+	printk(KERN_INFO "  lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
+	printk(KERN_INFO "  object = %s\n", (gl->gl_object) ? "yes" : "no");
+	printk(KERN_INFO "  le = %s\n",
+		   (list_empty(&gl->gl_le.le_list)) ? "no" : "yes");
+	printk(KERN_INFO "  reclaim = %s\n",
+		    (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
+	if (gl->gl_aspace)
+		printk(KERN_INFO "  aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
+		       gl->gl_aspace->i_mapping->nrpages);
+	else
+		printk(KERN_INFO "  aspace = no\n");
+	printk(KERN_INFO "  ail = %d\n", atomic_read(&gl->gl_ail_count));
+	if (gl->gl_req_gh) {
+		error = dump_holder("Request", gl->gl_req_gh);
+		if (error)
+			goto out;
+	}
+	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+		error = dump_holder("Holder", gh);
+		if (error)
+			goto out;
+	}
+	list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
+		error = dump_holder("Waiter1", gh);
+		if (error)
+			goto out;
+	}
+	list_for_each_entry(gh, &gl->gl_waiters2, gh_list) {
+		error = dump_holder("Waiter2", gh);
+		if (error)
+			goto out;
+	}
+	list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
+		error = dump_holder("Waiter3", gh);
+		if (error)
+			goto out;
+	}
+	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
+		if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
+		    list_empty(&gl->gl_holders)) {
+			error = dump_inode(gl->gl_object);
+			if (error)
+				goto out;
+		} else {
+			error = -ENOBUFS;
+			printk(KERN_INFO "  Inode: busy\n");
+		}
+	}
+
+	error = 0;
+
+out:
+	spin_unlock(&gl->gl_spin);
+	return error;
+}
+
+/**
+ * gfs2_dump_lockstate - print out the current lockstate
+ * @sdp: the filesystem
+ * @ub: the buffer to copy the information into
+ *
+ * If @ub is NULL, dump the lockstate to the console.
+ *
+ */
+
+static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
+{
+	struct gfs2_glock *gl;
+	struct hlist_node *h;
+	unsigned int x;
+	int error = 0;
+
+	for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
+
+		read_lock(gl_lock_addr(x));
+
+		hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
+			if (gl->gl_sbd != sdp)
+				continue;
+
+			error = dump_glock(gl);
+			if (error)
+				break;
+		}
+
+		read_unlock(gl_lock_addr(x));
+
+		if (error)
+			break;
+	}
+
+
+	return error;
+}
+
+int __init gfs2_glock_init(void)
+{
+	unsigned i;
+	for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
+		INIT_HLIST_HEAD(&gl_hash_table[i].hb_list);
+	}
+#ifdef GL_HASH_LOCK_SZ
+	for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
+		rwlock_init(&gl_hash_locks[i]);
+	}
+#endif
+	return 0;
+}
+
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
new file mode 100644
index 000000000000..2b2a889ee2cc
--- /dev/null
+++ b/fs/gfs2/glock.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GLOCK_DOT_H__
+#define __GLOCK_DOT_H__
+
+#include "incore.h"
+
+/* Flags for lock requests; used in gfs2_holder gh_flag field.
+   From lm_interface.h:
+#define LM_FLAG_TRY		0x00000001
+#define LM_FLAG_TRY_1CB		0x00000002
+#define LM_FLAG_NOEXP		0x00000004
+#define LM_FLAG_ANY		0x00000008
+#define LM_FLAG_PRIORITY	0x00000010 */
+
+#define GL_LOCAL_EXCL		0x00000020
+#define GL_ASYNC		0x00000040
+#define GL_EXACT		0x00000080
+#define GL_SKIP			0x00000100
+#define GL_ATIME		0x00000200
+#define GL_NOCACHE		0x00000400
+#define GL_NOCANCEL		0x00001000
+#define GL_AOP			0x00004000
+#define GL_DUMP			0x00008000
+
+#define GLR_TRYFAILED		13
+#define GLR_CANCELED		14
+
+static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
+{
+	struct gfs2_holder *gh;
+	int locked = 0;
+
+	/* Look in glock's list of holders for one with current task as owner */
+	spin_lock(&gl->gl_spin);
+	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+		if (gh->gh_owner == current) {
+			locked = 1;
+			break;
+		}
+	}
+	spin_unlock(&gl->gl_spin);
+
+	return locked;
+}
+
+static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
+{
+	return gl->gl_state == LM_ST_EXCLUSIVE;
+}
+
+static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl)
+{
+	return gl->gl_state == LM_ST_DEFERRED;
+}
+
+static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl)
+{
+	return gl->gl_state == LM_ST_SHARED;
+}
+
+static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
+{
+	int ret;
+	spin_lock(&gl->gl_spin);
+	ret = !list_empty(&gl->gl_waiters2) || !list_empty(&gl->gl_waiters3);
+	spin_unlock(&gl->gl_spin);
+	return ret;
+}
+
+int gfs2_glock_get(struct gfs2_sbd *sdp,
+		   u64 number, const struct gfs2_glock_operations *glops,
+		   int create, struct gfs2_glock **glp);
+void gfs2_glock_hold(struct gfs2_glock *gl);
+int gfs2_glock_put(struct gfs2_glock *gl);
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+		      struct gfs2_holder *gh);
+void gfs2_holder_reinit(unsigned int state, unsigned flags,
+			struct gfs2_holder *gh);
+void gfs2_holder_uninit(struct gfs2_holder *gh);
+
+void gfs2_glock_xmote_th(struct gfs2_glock *gl, unsigned int state, int flags);
+void gfs2_glock_drop_th(struct gfs2_glock *gl);
+
+int gfs2_glock_nq(struct gfs2_holder *gh);
+int gfs2_glock_poll(struct gfs2_holder *gh);
+int gfs2_glock_wait(struct gfs2_holder *gh);
+void gfs2_glock_dq(struct gfs2_holder *gh);
+
+int gfs2_glock_be_greedy(struct gfs2_glock *gl, unsigned int time);
+
+void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
+int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
+		      u64 number, const struct gfs2_glock_operations *glops,
+		      unsigned int state, int flags, struct gfs2_holder *gh);
+
+int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+void gfs2_glock_prefetch_num(struct gfs2_sbd *sdp, u64 number,
+			     const struct gfs2_glock_operations *glops,
+			     unsigned int state, int flags);
+void gfs2_glock_inode_squish(struct inode *inode);
+
+/**
+ * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
+ * @gl: the glock
+ * @state: the state we're requesting
+ * @flags: the modifier flags
+ * @gh: the holder structure
+ *
+ * Returns: 0, GLR_*, or errno
+ */
+
+static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
+				     unsigned int state, int flags,
+				     struct gfs2_holder *gh)
+{
+	int error;
+
+	gfs2_holder_init(gl, state, flags, gh);
+
+	error = gfs2_glock_nq(gh);
+	if (error)
+		gfs2_holder_uninit(gh);
+
+	return error;
+}
+
+/*  Lock Value Block functions  */
+
+int gfs2_lvb_hold(struct gfs2_glock *gl);
+void gfs2_lvb_unhold(struct gfs2_glock *gl);
+
+void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
+
+void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
+void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
+
+void gfs2_scand_internal(struct gfs2_sbd *sdp);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+
+int __init gfs2_glock_init(void);
+
+#endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
new file mode 100644
index 000000000000..41a6b6818a50
--- /dev/null
+++ b/fs/gfs2/glops.c
@@ -0,0 +1,615 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "util.h"
+#include "trans.h"
+
+/**
+ * ail_empty_gl - remove all buffers for a given lock from the AIL
+ * @gl: the glock
+ *
+ * None of the buffers should be dirty, locked, or pinned.
+ */
+
+static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	unsigned int blocks;
+	struct list_head *head = &gl->gl_ail_list;
+	struct gfs2_bufdata *bd;
+	struct buffer_head *bh;
+	u64 blkno;
+	int error;
+
+	blocks = atomic_read(&gl->gl_ail_count);
+	if (!blocks)
+		return;
+
+	error = gfs2_trans_begin(sdp, 0, blocks);
+	if (gfs2_assert_withdraw(sdp, !error))
+		return;
+
+	gfs2_log_lock(sdp);
+	while (!list_empty(head)) {
+		bd = list_entry(head->next, struct gfs2_bufdata,
+				bd_ail_gl_list);
+		bh = bd->bd_bh;
+		blkno = bh->b_blocknr;
+		gfs2_assert_withdraw(sdp, !buffer_busy(bh));
+
+		bd->bd_ail = NULL;
+		list_del(&bd->bd_ail_st_list);
+		list_del(&bd->bd_ail_gl_list);
+		atomic_dec(&gl->gl_ail_count);
+		brelse(bh);
+		gfs2_log_unlock(sdp);
+
+		gfs2_trans_add_revoke(sdp, blkno);
+
+		gfs2_log_lock(sdp);
+	}
+	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+	gfs2_log_unlock(sdp);
+
+	gfs2_trans_end(sdp);
+	gfs2_log_flush(sdp, NULL);
+}
+
+/**
+ * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
+ * @gl: the glock
+ *
+ */
+
+static void gfs2_pte_inval(struct gfs2_glock *gl)
+{
+	struct gfs2_inode *ip;
+	struct inode *inode;
+
+	ip = gl->gl_object;
+	inode = &ip->i_inode;
+	if (!ip || !S_ISREG(ip->i_di.di_mode))
+		return;
+
+	if (!test_bit(GIF_PAGED, &ip->i_flags))
+		return;
+
+	unmap_shared_mapping_range(inode->i_mapping, 0, 0);
+
+	if (test_bit(GIF_SW_PAGED, &ip->i_flags))
+		set_bit(GLF_DIRTY, &gl->gl_flags);
+
+	clear_bit(GIF_SW_PAGED, &ip->i_flags);
+}
+
+/**
+ * gfs2_page_inval - Invalidate all pages associated with a glock
+ * @gl: the glock
+ *
+ */
+
+static void gfs2_page_inval(struct gfs2_glock *gl)
+{
+	struct gfs2_inode *ip;
+	struct inode *inode;
+
+	ip = gl->gl_object;
+	inode = &ip->i_inode;
+	if (!ip || !S_ISREG(ip->i_di.di_mode))
+		return;
+
+	truncate_inode_pages(inode->i_mapping, 0);
+	gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), !inode->i_mapping->nrpages);
+	clear_bit(GIF_PAGED, &ip->i_flags);
+}
+
+/**
+ * gfs2_page_wait - Wait for writeback of data
+ * @gl: the glock
+ *
+ * Syncs data (not metadata) for a regular file.
+ * No-op for all other types.
+ */
+
+static void gfs2_page_wait(struct gfs2_glock *gl)
+{
+	struct gfs2_inode *ip = gl->gl_object;
+	struct inode *inode = &ip->i_inode;
+	struct address_space *mapping = inode->i_mapping;
+	int error;
+
+	if (!S_ISREG(ip->i_di.di_mode))
+		return;
+
+	error = filemap_fdatawait(mapping);
+
+	/* Put back any errors cleared by filemap_fdatawait()
+	   so they can be caught by someone who can pass them
+	   up to user space. */
+
+	if (error == -ENOSPC)
+		set_bit(AS_ENOSPC, &mapping->flags);
+	else if (error)
+		set_bit(AS_EIO, &mapping->flags);
+
+}
+
+static void gfs2_page_writeback(struct gfs2_glock *gl)
+{
+	struct gfs2_inode *ip = gl->gl_object;
+	struct inode *inode = &ip->i_inode;
+	struct address_space *mapping = inode->i_mapping;
+
+	if (!S_ISREG(ip->i_di.di_mode))
+		return;
+
+	filemap_fdatawrite(mapping);
+}
+
+/**
+ * meta_go_sync - sync out the metadata for this glock
+ * @gl: the glock
+ * @flags: DIO_*
+ *
+ * Called when demoting or unlocking an EX glock.  We must flush
+ * to disk all dirty buffers/pages relating to this glock, and must not
+ * not return to caller to demote/unlock the glock until I/O is complete.
+ */
+
+static void meta_go_sync(struct gfs2_glock *gl, int flags)
+{
+	if (!(flags & DIO_METADATA))
+		return;
+
+	if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
+		gfs2_log_flush(gl->gl_sbd, gl);
+		gfs2_meta_sync(gl);
+		if (flags & DIO_RELEASE)
+			gfs2_ail_empty_gl(gl);
+	}
+
+}
+
+/**
+ * meta_go_inval - invalidate the metadata for this glock
+ * @gl: the glock
+ * @flags:
+ *
+ */
+
+static void meta_go_inval(struct gfs2_glock *gl, int flags)
+{
+	if (!(flags & DIO_METADATA))
+		return;
+
+	gfs2_meta_inval(gl);
+	gl->gl_vn++;
+}
+
+/**
+ * inode_go_xmote_th - promote/demote a glock
+ * @gl: the glock
+ * @state: the requested state
+ * @flags:
+ *
+ */
+
+static void inode_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
+			      int flags)
+{
+	if (gl->gl_state != LM_ST_UNLOCKED)
+		gfs2_pte_inval(gl);
+	gfs2_glock_xmote_th(gl, state, flags);
+}
+
+/**
+ * inode_go_xmote_bh - After promoting/demoting a glock
+ * @gl: the glock
+ *
+ */
+
+static void inode_go_xmote_bh(struct gfs2_glock *gl)
+{
+	struct gfs2_holder *gh = gl->gl_req_gh;
+	struct buffer_head *bh;
+	int error;
+
+	if (gl->gl_state != LM_ST_UNLOCKED &&
+	    (!gh || !(gh->gh_flags & GL_SKIP))) {
+		error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
+		if (!error)
+			brelse(bh);
+	}
+}
+
+/**
+ * inode_go_drop_th - unlock a glock
+ * @gl: the glock
+ *
+ * Invoked from rq_demote().
+ * Another node needs the lock in EXCLUSIVE mode, or lock (unused for too long)
+ * is being purged from our node's glock cache; we're dropping lock.
+ */
+
+static void inode_go_drop_th(struct gfs2_glock *gl)
+{
+	gfs2_pte_inval(gl);
+	gfs2_glock_drop_th(gl);
+}
+
+/**
+ * inode_go_sync - Sync the dirty data and/or metadata for an inode glock
+ * @gl: the glock protecting the inode
+ * @flags:
+ *
+ */
+
+static void inode_go_sync(struct gfs2_glock *gl, int flags)
+{
+	int meta = (flags & DIO_METADATA);
+	int data = (flags & DIO_DATA);
+
+	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
+		if (meta && data) {
+			gfs2_page_writeback(gl);
+			gfs2_log_flush(gl->gl_sbd, gl);
+			gfs2_meta_sync(gl);
+			gfs2_page_wait(gl);
+			clear_bit(GLF_DIRTY, &gl->gl_flags);
+		} else if (meta) {
+			gfs2_log_flush(gl->gl_sbd, gl);
+			gfs2_meta_sync(gl);
+		} else if (data) {
+			gfs2_page_writeback(gl);
+			gfs2_page_wait(gl);
+		}
+		if (flags & DIO_RELEASE)
+			gfs2_ail_empty_gl(gl);
+	}
+}
+
+/**
+ * inode_go_inval - prepare a inode glock to be released
+ * @gl: the glock
+ * @flags:
+ *
+ */
+
+static void inode_go_inval(struct gfs2_glock *gl, int flags)
+{
+	int meta = (flags & DIO_METADATA);
+	int data = (flags & DIO_DATA);
+
+	if (meta) {
+		gfs2_meta_inval(gl);
+		gl->gl_vn++;
+	}
+	if (data)
+		gfs2_page_inval(gl);
+}
+
+/**
+ * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int inode_go_demote_ok(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	int demote = 0;
+
+	if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
+		demote = 1;
+	else if (!sdp->sd_args.ar_localcaching &&
+		 time_after_eq(jiffies, gl->gl_stamp +
+			       gfs2_tune_get(sdp, gt_demote_secs) * HZ))
+		demote = 1;
+
+	return demote;
+}
+
+/**
+ * inode_go_lock - operation done after an inode lock is locked by a process
+ * @gl: the glock
+ * @flags:
+ *
+ * Returns: errno
+ */
+
+static int inode_go_lock(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_inode *ip = gl->gl_object;
+	int error = 0;
+
+	if (!ip)
+		return 0;
+
+	if (ip->i_vn != gl->gl_vn) {
+		error = gfs2_inode_refresh(ip);
+		if (error)
+			return error;
+		gfs2_inode_attr_in(ip);
+	}
+
+	if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
+	    (gl->gl_state == LM_ST_EXCLUSIVE) &&
+	    (gh->gh_flags & GL_LOCAL_EXCL))
+		error = gfs2_truncatei_resume(ip);
+
+	return error;
+}
+
+/**
+ * inode_go_unlock - operation done before an inode lock is unlocked by a
+ *		     process
+ * @gl: the glock
+ * @flags:
+ *
+ */
+
+static void inode_go_unlock(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_inode *ip = gl->gl_object;
+
+	if (ip == NULL)
+		return;
+	if (test_bit(GLF_DIRTY, &gl->gl_flags))
+		gfs2_inode_attr_in(ip);
+	gfs2_meta_cache_flush(ip);
+}
+
+/**
+ * inode_greedy -
+ * @gl: the glock
+ *
+ */
+
+static void inode_greedy(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_inode *ip = gl->gl_object;
+	unsigned int quantum = gfs2_tune_get(sdp, gt_greedy_quantum);
+	unsigned int max = gfs2_tune_get(sdp, gt_greedy_max);
+	unsigned int new_time;
+
+	spin_lock(&ip->i_spin);
+
+	if (time_after(ip->i_last_pfault + quantum, jiffies)) {
+		new_time = ip->i_greedy + quantum;
+		if (new_time > max)
+			new_time = max;
+	} else {
+		new_time = ip->i_greedy - quantum;
+		if (!new_time || new_time > max)
+			new_time = 1;
+	}
+
+	ip->i_greedy = new_time;
+
+	spin_unlock(&ip->i_spin);
+
+	iput(&ip->i_inode);
+}
+
+/**
+ * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int rgrp_go_demote_ok(struct gfs2_glock *gl)
+{
+	return !gl->gl_aspace->i_mapping->nrpages;
+}
+
+/**
+ * rgrp_go_lock - operation done after an rgrp lock is locked by
+ *    a first holder on this node.
+ * @gl: the glock
+ * @flags:
+ *
+ * Returns: errno
+ */
+
+static int rgrp_go_lock(struct gfs2_holder *gh)
+{
+	return gfs2_rgrp_bh_get(gh->gh_gl->gl_object);
+}
+
+/**
+ * rgrp_go_unlock - operation done before an rgrp lock is unlocked by
+ *    a last holder on this node.
+ * @gl: the glock
+ * @flags:
+ *
+ */
+
+static void rgrp_go_unlock(struct gfs2_holder *gh)
+{
+	gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
+}
+
+/**
+ * trans_go_xmote_th - promote/demote the transaction glock
+ * @gl: the glock
+ * @state: the requested state
+ * @flags:
+ *
+ */
+
+static void trans_go_xmote_th(struct gfs2_glock *gl, unsigned int state,
+			      int flags)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+
+	if (gl->gl_state != LM_ST_UNLOCKED &&
+	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+		gfs2_meta_syncfs(sdp);
+		gfs2_log_shutdown(sdp);
+	}
+
+	gfs2_glock_xmote_th(gl, state, flags);
+}
+
+/**
+ * trans_go_xmote_bh - After promoting/demoting the transaction glock
+ * @gl: the glock
+ *
+ */
+
+static void trans_go_xmote_bh(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+	struct gfs2_glock *j_gl = ip->i_gl;
+	struct gfs2_log_header head;
+	int error;
+
+	if (gl->gl_state != LM_ST_UNLOCKED &&
+	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+		gfs2_meta_cache_flush(GFS2_I(sdp->sd_jdesc->jd_inode));
+		j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
+
+		error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+		if (error)
+			gfs2_consist(sdp);
+		if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT))
+			gfs2_consist(sdp);
+
+		/*  Initialize some head of the log stuff  */
+		if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) {
+			sdp->sd_log_sequence = head.lh_sequence + 1;
+			gfs2_log_pointers_init(sdp, head.lh_blkno);
+		}
+	}
+}
+
+/**
+ * trans_go_drop_th - unlock the transaction glock
+ * @gl: the glock
+ *
+ * We want to sync the device even with localcaching.  Remember
+ * that localcaching journal replay only marks buffers dirty.
+ */
+
+static void trans_go_drop_th(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+
+	if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+		gfs2_meta_syncfs(sdp);
+		gfs2_log_shutdown(sdp);
+	}
+
+	gfs2_glock_drop_th(gl);
+}
+
+/**
+ * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
+ * @gl: the glock
+ *
+ * Returns: 1 if it's ok
+ */
+
+static int quota_go_demote_ok(struct gfs2_glock *gl)
+{
+	return !atomic_read(&gl->gl_lvb_count);
+}
+
+const struct gfs2_glock_operations gfs2_meta_glops = {
+	.go_xmote_th = gfs2_glock_xmote_th,
+	.go_drop_th = gfs2_glock_drop_th,
+	.go_type = LM_TYPE_META,
+};
+
+const struct gfs2_glock_operations gfs2_inode_glops = {
+	.go_xmote_th = inode_go_xmote_th,
+	.go_xmote_bh = inode_go_xmote_bh,
+	.go_drop_th = inode_go_drop_th,
+	.go_sync = inode_go_sync,
+	.go_inval = inode_go_inval,
+	.go_demote_ok = inode_go_demote_ok,
+	.go_lock = inode_go_lock,
+	.go_unlock = inode_go_unlock,
+	.go_greedy = inode_greedy,
+	.go_type = LM_TYPE_INODE,
+};
+
+const struct gfs2_glock_operations gfs2_rgrp_glops = {
+	.go_xmote_th = gfs2_glock_xmote_th,
+	.go_drop_th = gfs2_glock_drop_th,
+	.go_sync = meta_go_sync,
+	.go_inval = meta_go_inval,
+	.go_demote_ok = rgrp_go_demote_ok,
+	.go_lock = rgrp_go_lock,
+	.go_unlock = rgrp_go_unlock,
+	.go_type = LM_TYPE_RGRP,
+};
+
+const struct gfs2_glock_operations gfs2_trans_glops = {
+	.go_xmote_th = trans_go_xmote_th,
+	.go_xmote_bh = trans_go_xmote_bh,
+	.go_drop_th = trans_go_drop_th,
+	.go_type = LM_TYPE_NONDISK,
+};
+
+const struct gfs2_glock_operations gfs2_iopen_glops = {
+	.go_xmote_th = gfs2_glock_xmote_th,
+	.go_drop_th = gfs2_glock_drop_th,
+	.go_type = LM_TYPE_IOPEN,
+};
+
+const struct gfs2_glock_operations gfs2_flock_glops = {
+	.go_xmote_th = gfs2_glock_xmote_th,
+	.go_drop_th = gfs2_glock_drop_th,
+	.go_type = LM_TYPE_FLOCK,
+};
+
+const struct gfs2_glock_operations gfs2_nondisk_glops = {
+	.go_xmote_th = gfs2_glock_xmote_th,
+	.go_drop_th = gfs2_glock_drop_th,
+	.go_type = LM_TYPE_NONDISK,
+};
+
+const struct gfs2_glock_operations gfs2_quota_glops = {
+	.go_xmote_th = gfs2_glock_xmote_th,
+	.go_drop_th = gfs2_glock_drop_th,
+	.go_demote_ok = quota_go_demote_ok,
+	.go_type = LM_TYPE_QUOTA,
+};
+
+const struct gfs2_glock_operations gfs2_journal_glops = {
+	.go_xmote_th = gfs2_glock_xmote_th,
+	.go_drop_th = gfs2_glock_drop_th,
+	.go_type = LM_TYPE_JOURNAL,
+};
+
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
new file mode 100644
index 000000000000..a1d9b5b024e6
--- /dev/null
+++ b/fs/gfs2/glops.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __GLOPS_DOT_H__
+#define __GLOPS_DOT_H__
+
+#include "incore.h"
+
+extern const struct gfs2_glock_operations gfs2_meta_glops;
+extern const struct gfs2_glock_operations gfs2_inode_glops;
+extern const struct gfs2_glock_operations gfs2_rgrp_glops;
+extern const struct gfs2_glock_operations gfs2_trans_glops;
+extern const struct gfs2_glock_operations gfs2_iopen_glops;
+extern const struct gfs2_glock_operations gfs2_flock_glops;
+extern const struct gfs2_glock_operations gfs2_nondisk_glops;
+extern const struct gfs2_glock_operations gfs2_quota_glops;
+extern const struct gfs2_glock_operations gfs2_journal_glops;
+
+#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
new file mode 100644
index 000000000000..118dc693d111
--- /dev/null
+++ b/fs/gfs2/incore.h
@@ -0,0 +1,634 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __INCORE_DOT_H__
+#define __INCORE_DOT_H__
+
+#include <linux/fs.h>
+
+#define DIO_WAIT	0x00000010
+#define DIO_METADATA	0x00000020
+#define DIO_DATA	0x00000040
+#define DIO_RELEASE	0x00000080
+#define DIO_ALL		0x00000100
+
+struct gfs2_log_operations;
+struct gfs2_log_element;
+struct gfs2_holder;
+struct gfs2_glock;
+struct gfs2_quota_data;
+struct gfs2_trans;
+struct gfs2_ail;
+struct gfs2_jdesc;
+struct gfs2_sbd;
+
+typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
+
+/*
+ * Structure of operations that are associated with each
+ * type of element in the log.
+ */
+
+struct gfs2_log_operations {
+	void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
+	void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
+	void (*lo_before_commit) (struct gfs2_sbd *sdp);
+	void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
+	void (*lo_before_scan) (struct gfs2_jdesc *jd,
+				struct gfs2_log_header *head, int pass);
+	int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start,
+				 struct gfs2_log_descriptor *ld, __be64 *ptr,
+				 int pass);
+	void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass);
+	const char *lo_name;
+};
+
+struct gfs2_log_element {
+	struct list_head le_list;
+	const struct gfs2_log_operations *le_ops;
+};
+
+struct gfs2_bitmap {
+	struct buffer_head *bi_bh;
+	char *bi_clone;
+	u32 bi_offset;
+	u32 bi_start;
+	u32 bi_len;
+};
+
+struct gfs2_rgrpd {
+	struct list_head rd_list;	/* Link with superblock */
+	struct list_head rd_list_mru;
+	struct list_head rd_recent;	/* Recently used rgrps */
+	struct gfs2_glock *rd_gl;	/* Glock for this rgrp */
+	struct gfs2_rindex rd_ri;
+	struct gfs2_rgrp rd_rg;
+	u64 rd_rg_vn;
+	struct gfs2_bitmap *rd_bits;
+	unsigned int rd_bh_count;
+	struct mutex rd_mutex;
+	u32 rd_free_clone;
+	struct gfs2_log_element rd_le;
+	u32 rd_last_alloc_data;
+	u32 rd_last_alloc_meta;
+	struct gfs2_sbd *rd_sbd;
+};
+
+enum gfs2_state_bits {
+	BH_Pinned = BH_PrivateStart,
+	BH_Escaped = BH_PrivateStart + 1,
+};
+
+BUFFER_FNS(Pinned, pinned)
+TAS_BUFFER_FNS(Pinned, pinned)
+BUFFER_FNS(Escaped, escaped)
+TAS_BUFFER_FNS(Escaped, escaped)
+
+struct gfs2_bufdata {
+	struct buffer_head *bd_bh;
+	struct gfs2_glock *bd_gl;
+
+	struct list_head bd_list_tr;
+	struct gfs2_log_element bd_le;
+
+	struct gfs2_ail *bd_ail;
+	struct list_head bd_ail_st_list;
+	struct list_head bd_ail_gl_list;
+};
+
+struct gfs2_glock_operations {
+	void (*go_xmote_th) (struct gfs2_glock * gl, unsigned int state,
+			     int flags);
+	void (*go_xmote_bh) (struct gfs2_glock * gl);
+	void (*go_drop_th) (struct gfs2_glock * gl);
+	void (*go_drop_bh) (struct gfs2_glock * gl);
+	void (*go_sync) (struct gfs2_glock * gl, int flags);
+	void (*go_inval) (struct gfs2_glock * gl, int flags);
+	int (*go_demote_ok) (struct gfs2_glock * gl);
+	int (*go_lock) (struct gfs2_holder * gh);
+	void (*go_unlock) (struct gfs2_holder * gh);
+	void (*go_callback) (struct gfs2_glock * gl, unsigned int state);
+	void (*go_greedy) (struct gfs2_glock * gl);
+	const int go_type;
+};
+
+enum {
+	/* Actions */
+	HIF_MUTEX		= 0,
+	HIF_PROMOTE		= 1,
+	HIF_DEMOTE		= 2,
+	HIF_GREEDY		= 3,
+
+	/* States */
+	HIF_ALLOCED		= 4,
+	HIF_DEALLOC		= 5,
+	HIF_HOLDER		= 6,
+	HIF_FIRST		= 7,
+	HIF_ABORTED		= 9,
+};
+
+struct gfs2_holder {
+	struct list_head gh_list;
+
+	struct gfs2_glock *gh_gl;
+	struct task_struct *gh_owner;
+	unsigned int gh_state;
+	unsigned gh_flags;
+
+	int gh_error;
+	unsigned long gh_iflags;
+	struct completion gh_wait;
+	unsigned long gh_ip;
+};
+
+enum {
+	GLF_LOCK		= 1,
+	GLF_STICKY		= 2,
+	GLF_PREFETCH		= 3,
+	GLF_DIRTY		= 5,
+	GLF_SKIP_WAITERS2	= 6,
+	GLF_GREEDY		= 7,
+};
+
+struct gfs2_glock {
+	struct hlist_node gl_list;
+	unsigned long gl_flags;		/* GLF_... */
+	struct lm_lockname gl_name;
+	atomic_t gl_ref;
+
+	spinlock_t gl_spin;
+
+	unsigned int gl_state;
+	unsigned int gl_hash;
+	struct task_struct *gl_owner;
+	unsigned long gl_ip;
+	struct list_head gl_holders;
+	struct list_head gl_waiters1;	/* HIF_MUTEX */
+	struct list_head gl_waiters2;	/* HIF_DEMOTE, HIF_GREEDY */
+	struct list_head gl_waiters3;	/* HIF_PROMOTE */
+
+	const struct gfs2_glock_operations *gl_ops;
+
+	struct gfs2_holder *gl_req_gh;
+	gfs2_glop_bh_t gl_req_bh;
+
+	void *gl_lock;
+	char *gl_lvb;
+	atomic_t gl_lvb_count;
+
+	u64 gl_vn;
+	unsigned long gl_stamp;
+	void *gl_object;
+
+	struct list_head gl_reclaim;
+
+	struct gfs2_sbd *gl_sbd;
+
+	struct inode *gl_aspace;
+	struct gfs2_log_element gl_le;
+	struct list_head gl_ail_list;
+	atomic_t gl_ail_count;
+};
+
+struct gfs2_alloc {
+	/* Quota stuff */
+
+	struct gfs2_quota_data *al_qd[2*MAXQUOTAS];
+	struct gfs2_holder al_qd_ghs[2*MAXQUOTAS];
+	unsigned int al_qd_num;
+
+	u32 al_requested; /* Filled in by caller of gfs2_inplace_reserve() */
+	u32 al_alloced; /* Filled in by gfs2_alloc_*() */
+
+	/* Filled in by gfs2_inplace_reserve() */
+
+	unsigned int al_line;
+	char *al_file;
+	struct gfs2_holder al_ri_gh;
+	struct gfs2_holder al_rgd_gh;
+	struct gfs2_rgrpd *al_rgd;
+
+};
+
+enum {
+	GIF_QD_LOCKED		= 1,
+	GIF_PAGED		= 2,
+	GIF_SW_PAGED		= 3,
+};
+
+struct gfs2_inode {
+	struct inode i_inode;
+	struct gfs2_inum i_num;
+
+	unsigned long i_flags;		/* GIF_... */
+
+	u64 i_vn;
+	struct gfs2_dinode i_di; /* To be replaced by ref to block */
+
+	struct gfs2_glock *i_gl; /* Move into i_gh? */
+	struct gfs2_holder i_iopen_gh;
+	struct gfs2_holder i_gh; /* for prepare/commit_write only */
+	struct gfs2_alloc i_alloc;
+	u64 i_last_rg_alloc;
+
+	spinlock_t i_spin;
+	struct rw_semaphore i_rw_mutex;
+	unsigned int i_greedy;
+	unsigned long i_last_pfault;
+
+	struct buffer_head *i_cache[GFS2_MAX_META_HEIGHT];
+};
+
+/*
+ * Since i_inode is the first element of struct gfs2_inode,
+ * this is effectively a cast.
+ */
+static inline struct gfs2_inode *GFS2_I(struct inode *inode)
+{
+	return container_of(inode, struct gfs2_inode, i_inode);
+}
+
+/* To be removed? */
+static inline struct gfs2_sbd *GFS2_SB(struct inode *inode)
+{
+	return inode->i_sb->s_fs_info;
+}
+
+enum {
+	GFF_DID_DIRECT_ALLOC	= 0,
+	GFF_EXLOCK = 1,
+};
+
+struct gfs2_file {
+	unsigned long f_flags;		/* GFF_... */
+	struct mutex f_fl_mutex;
+	struct gfs2_holder f_fl_gh;
+};
+
+struct gfs2_revoke {
+	struct gfs2_log_element rv_le;
+	u64 rv_blkno;
+};
+
+struct gfs2_revoke_replay {
+	struct list_head rr_list;
+	u64 rr_blkno;
+	unsigned int rr_where;
+};
+
+enum {
+	QDF_USER		= 0,
+	QDF_CHANGE		= 1,
+	QDF_LOCKED		= 2,
+};
+
+struct gfs2_quota_lvb {
+        __be32 qb_magic;
+        u32 __pad;
+        __be64 qb_limit;      /* Hard limit of # blocks to alloc */
+        __be64 qb_warn;       /* Warn user when alloc is above this # */
+        __be64 qb_value;       /* Current # blocks allocated */
+};
+
+struct gfs2_quota_data {
+	struct list_head qd_list;
+	unsigned int qd_count;
+
+	u32 qd_id;
+	unsigned long qd_flags;		/* QDF_... */
+
+	s64 qd_change;
+	s64 qd_change_sync;
+
+	unsigned int qd_slot;
+	unsigned int qd_slot_count;
+
+	struct buffer_head *qd_bh;
+	struct gfs2_quota_change *qd_bh_qc;
+	unsigned int qd_bh_count;
+
+	struct gfs2_glock *qd_gl;
+	struct gfs2_quota_lvb qd_qb;
+
+	u64 qd_sync_gen;
+	unsigned long qd_last_warn;
+	unsigned long qd_last_touched;
+};
+
+struct gfs2_log_buf {
+	struct list_head lb_list;
+	struct buffer_head *lb_bh;
+	struct buffer_head *lb_real;
+};
+
+struct gfs2_trans {
+	unsigned long tr_ip;
+
+	unsigned int tr_blocks;
+	unsigned int tr_revokes;
+	unsigned int tr_reserved;
+
+	struct gfs2_holder tr_t_gh;
+
+	int tr_touched;
+
+	unsigned int tr_num_buf;
+	unsigned int tr_num_buf_new;
+	unsigned int tr_num_buf_rm;
+	struct list_head tr_list_buf;
+
+	unsigned int tr_num_revoke;
+	unsigned int tr_num_revoke_rm;
+};
+
+struct gfs2_ail {
+	struct list_head ai_list;
+
+	unsigned int ai_first;
+	struct list_head ai_ail1_list;
+	struct list_head ai_ail2_list;
+
+	u64 ai_sync_gen;
+};
+
+struct gfs2_jdesc {
+	struct list_head jd_list;
+
+	struct inode *jd_inode;
+	unsigned int jd_jid;
+	int jd_dirty;
+
+	unsigned int jd_blocks;
+};
+
+#define GFS2_GLOCKD_DEFAULT	1
+#define GFS2_GLOCKD_MAX		16
+
+#define GFS2_QUOTA_DEFAULT	GFS2_QUOTA_OFF
+#define GFS2_QUOTA_OFF		0
+#define GFS2_QUOTA_ACCOUNT	1
+#define GFS2_QUOTA_ON		2
+
+#define GFS2_DATA_DEFAULT	GFS2_DATA_ORDERED
+#define GFS2_DATA_WRITEBACK	1
+#define GFS2_DATA_ORDERED	2
+
+struct gfs2_args {
+	char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
+	char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
+	char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
+	int ar_spectator; /* Don't get a journal because we're always RO */
+	int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
+	int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
+	int ar_localcaching; /* Local-style caching (dangerous on multihost) */
+	int ar_debug; /* Oops on errors instead of trying to be graceful */
+	int ar_upgrade; /* Upgrade ondisk/multihost format */
+	unsigned int ar_num_glockd; /* Number of glockd threads */
+	int ar_posix_acl; /* Enable posix acls */
+	int ar_quota; /* off/account/on */
+	int ar_suiddir; /* suiddir support */
+	int ar_data; /* ordered/writeback */
+};
+
+struct gfs2_tune {
+	spinlock_t gt_spin;
+
+	unsigned int gt_ilimit;
+	unsigned int gt_ilimit_tries;
+	unsigned int gt_ilimit_min;
+	unsigned int gt_demote_secs; /* Cache retention for unheld glock */
+	unsigned int gt_incore_log_blocks;
+	unsigned int gt_log_flush_secs;
+	unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
+
+	unsigned int gt_scand_secs;
+	unsigned int gt_recoverd_secs;
+	unsigned int gt_logd_secs;
+	unsigned int gt_quotad_secs;
+
+	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
+	unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
+	unsigned int gt_quota_scale_num; /* Numerator */
+	unsigned int gt_quota_scale_den; /* Denominator */
+	unsigned int gt_quota_cache_secs;
+	unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
+	unsigned int gt_atime_quantum; /* Min secs between atime updates */
+	unsigned int gt_new_files_jdata;
+	unsigned int gt_new_files_directio;
+	unsigned int gt_max_atomic_write; /* Split big writes into this size */
+	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
+	unsigned int gt_lockdump_size;
+	unsigned int gt_stall_secs; /* Detects trouble! */
+	unsigned int gt_complain_secs;
+	unsigned int gt_reclaim_limit; /* Max num of glocks in reclaim list */
+	unsigned int gt_entries_per_readdir;
+	unsigned int gt_prefetch_secs; /* Usage window for prefetched glocks */
+	unsigned int gt_greedy_default;
+	unsigned int gt_greedy_quantum;
+	unsigned int gt_greedy_max;
+	unsigned int gt_statfs_quantum;
+	unsigned int gt_statfs_slow;
+};
+
+enum {
+	SDF_JOURNAL_CHECKED	= 0,
+	SDF_JOURNAL_LIVE	= 1,
+	SDF_SHUTDOWN		= 2,
+	SDF_NOATIME		= 3,
+};
+
+#define GFS2_FSNAME_LEN		256
+
+struct gfs2_sbd {
+	struct super_block *sd_vfs;
+	struct super_block *sd_vfs_meta;
+	struct kobject sd_kobj;
+	unsigned long sd_flags;	/* SDF_... */
+	struct gfs2_sb sd_sb;
+
+	/* Constants computed on mount */
+
+	u32 sd_fsb2bb;
+	u32 sd_fsb2bb_shift;
+	u32 sd_diptrs;	/* Number of pointers in a dinode */
+	u32 sd_inptrs;	/* Number of pointers in a indirect block */
+	u32 sd_jbsize;	/* Size of a journaled data block */
+	u32 sd_hash_bsize;	/* sizeof(exhash block) */
+	u32 sd_hash_bsize_shift;
+	u32 sd_hash_ptrs;	/* Number of pointers in a hash block */
+	u32 sd_qc_per_block;
+	u32 sd_max_dirres;	/* Max blocks needed to add a directory entry */
+	u32 sd_max_height;	/* Max height of a file's metadata tree */
+	u64 sd_heightsize[GFS2_MAX_META_HEIGHT];
+	u32 sd_max_jheight; /* Max height of journaled file's meta tree */
+	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT];
+
+	struct gfs2_args sd_args;	/* Mount arguments */
+	struct gfs2_tune sd_tune;	/* Filesystem tuning structure */
+
+	/* Lock Stuff */
+
+	struct lm_lockstruct sd_lockstruct;
+	struct list_head sd_reclaim_list;
+	spinlock_t sd_reclaim_lock;
+	wait_queue_head_t sd_reclaim_wq;
+	atomic_t sd_reclaim_count;
+	struct gfs2_holder sd_live_gh;
+	struct gfs2_glock *sd_rename_gl;
+	struct gfs2_glock *sd_trans_gl;
+
+	/* Inode Stuff */
+
+	struct inode *sd_master_dir;
+	struct inode *sd_jindex;
+	struct inode *sd_inum_inode;
+	struct inode *sd_statfs_inode;
+	struct inode *sd_ir_inode;
+	struct inode *sd_sc_inode;
+	struct inode *sd_qc_inode;
+	struct inode *sd_rindex;
+	struct inode *sd_quota_inode;
+
+	/* Inum stuff */
+
+	struct mutex sd_inum_mutex;
+
+	/* StatFS stuff */
+
+	spinlock_t sd_statfs_spin;
+	struct mutex sd_statfs_mutex;
+	struct gfs2_statfs_change sd_statfs_master;
+	struct gfs2_statfs_change sd_statfs_local;
+	unsigned long sd_statfs_sync_time;
+
+	/* Resource group stuff */
+
+	u64 sd_rindex_vn;
+	spinlock_t sd_rindex_spin;
+	struct mutex sd_rindex_mutex;
+	struct list_head sd_rindex_list;
+	struct list_head sd_rindex_mru_list;
+	struct list_head sd_rindex_recent_list;
+	struct gfs2_rgrpd *sd_rindex_forward;
+	unsigned int sd_rgrps;
+
+	/* Journal index stuff */
+
+	struct list_head sd_jindex_list;
+	spinlock_t sd_jindex_spin;
+	struct mutex sd_jindex_mutex;
+	unsigned int sd_journals;
+	unsigned long sd_jindex_refresh_time;
+
+	struct gfs2_jdesc *sd_jdesc;
+	struct gfs2_holder sd_journal_gh;
+	struct gfs2_holder sd_jinode_gh;
+
+	struct gfs2_holder sd_ir_gh;
+	struct gfs2_holder sd_sc_gh;
+	struct gfs2_holder sd_qc_gh;
+
+	/* Daemon stuff */
+
+	struct task_struct *sd_scand_process;
+	struct task_struct *sd_recoverd_process;
+	struct task_struct *sd_logd_process;
+	struct task_struct *sd_quotad_process;
+	struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
+	unsigned int sd_glockd_num;
+
+	/* Quota stuff */
+
+	struct list_head sd_quota_list;
+	atomic_t sd_quota_count;
+	spinlock_t sd_quota_spin;
+	struct mutex sd_quota_mutex;
+
+	unsigned int sd_quota_slots;
+	unsigned int sd_quota_chunks;
+	unsigned char **sd_quota_bitmap;
+
+	u64 sd_quota_sync_gen;
+	unsigned long sd_quota_sync_time;
+
+	/* Log stuff */
+
+	spinlock_t sd_log_lock;
+
+	unsigned int sd_log_blks_reserved;
+	unsigned int sd_log_commited_buf;
+	unsigned int sd_log_commited_revoke;
+
+	unsigned int sd_log_num_gl;
+	unsigned int sd_log_num_buf;
+	unsigned int sd_log_num_revoke;
+	unsigned int sd_log_num_rg;
+	unsigned int sd_log_num_databuf;
+	unsigned int sd_log_num_jdata;
+	unsigned int sd_log_num_hdrs;
+
+	struct list_head sd_log_le_gl;
+	struct list_head sd_log_le_buf;
+	struct list_head sd_log_le_revoke;
+	struct list_head sd_log_le_rg;
+	struct list_head sd_log_le_databuf;
+
+	unsigned int sd_log_blks_free;
+	struct mutex sd_log_reserve_mutex;
+
+	u64 sd_log_sequence;
+	unsigned int sd_log_head;
+	unsigned int sd_log_tail;
+	int sd_log_idle;
+
+	unsigned long sd_log_flush_time;
+	struct rw_semaphore sd_log_flush_lock;
+	struct list_head sd_log_flush_list;
+
+	unsigned int sd_log_flush_head;
+	u64 sd_log_flush_wrapped;
+
+	struct list_head sd_ail1_list;
+	struct list_head sd_ail2_list;
+	u64 sd_ail_sync_gen;
+
+	/* Replay stuff */
+
+	struct list_head sd_revoke_list;
+	unsigned int sd_replay_tail;
+
+	unsigned int sd_found_blocks;
+	unsigned int sd_found_revokes;
+	unsigned int sd_replayed_blocks;
+
+	/* For quiescing the filesystem */
+
+	struct gfs2_holder sd_freeze_gh;
+	struct mutex sd_freeze_lock;
+	unsigned int sd_freeze_count;
+
+	/* Counters */
+
+	atomic_t sd_glock_count;
+	atomic_t sd_glock_held_count;
+	atomic_t sd_inode_count;
+	atomic_t sd_reclaimed;
+
+	char sd_fsname[GFS2_FSNAME_LEN];
+	char sd_table_name[GFS2_FSNAME_LEN];
+	char sd_proto_name[GFS2_FSNAME_LEN];
+
+	/* Debugging crud */
+
+	unsigned long sd_last_warning;
+	struct vfsmount *sd_gfs2mnt;
+};
+
+#endif /* __INCORE_DOT_H__ */
+
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
new file mode 100644
index 000000000000..57c43ac47925
--- /dev/null
+++ b/fs/gfs2/inode.c
@@ -0,0 +1,1379 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/posix_acl.h>
+#include <linux/sort.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include <linux/security.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "bmap.h"
+#include "dir.h"
+#include "eattr.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "ops_address.h"
+#include "ops_file.h"
+#include "ops_inode.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * gfs2_inode_attr_in - Copy attributes from the dinode into the VFS inode
+ * @ip: The GFS2 inode (with embedded disk inode data)
+ * @inode:  The Linux VFS inode
+ *
+ */
+
+void gfs2_inode_attr_in(struct gfs2_inode *ip)
+{
+	struct inode *inode = &ip->i_inode;
+	struct gfs2_dinode *di = &ip->i_di;
+
+	inode->i_ino = ip->i_num.no_addr;
+
+	switch (di->di_mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		inode->i_rdev = MKDEV(di->di_major, di->di_minor);
+		break;
+	default:
+		inode->i_rdev = 0;
+		break;
+	};
+
+	inode->i_mode = di->di_mode;
+	inode->i_nlink = di->di_nlink;
+	inode->i_uid = di->di_uid;
+	inode->i_gid = di->di_gid;
+	i_size_write(inode, di->di_size);
+	inode->i_atime.tv_sec = di->di_atime;
+	inode->i_mtime.tv_sec = di->di_mtime;
+	inode->i_ctime.tv_sec = di->di_ctime;
+	inode->i_atime.tv_nsec = 0;
+	inode->i_mtime.tv_nsec = 0;
+	inode->i_ctime.tv_nsec = 0;
+	inode->i_blocks = di->di_blocks <<
+		(GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
+
+	if (di->di_flags & GFS2_DIF_IMMUTABLE)
+		inode->i_flags |= S_IMMUTABLE;
+	else
+		inode->i_flags &= ~S_IMMUTABLE;
+
+	if (di->di_flags & GFS2_DIF_APPENDONLY)
+		inode->i_flags |= S_APPEND;
+	else
+		inode->i_flags &= ~S_APPEND;
+}
+
+/**
+ * gfs2_inode_attr_out - Copy attributes from VFS inode into the dinode
+ * @ip: The GFS2 inode
+ *
+ * Only copy out the attributes that we want the VFS layer
+ * to be able to modify.
+ */
+
+void gfs2_inode_attr_out(struct gfs2_inode *ip)
+{
+	struct inode *inode = &ip->i_inode;
+	struct gfs2_dinode *di = &ip->i_di;
+	gfs2_assert_withdraw(GFS2_SB(inode),
+		(di->di_mode & S_IFMT) == (inode->i_mode & S_IFMT));
+	di->di_mode = inode->i_mode;
+	di->di_uid = inode->i_uid;
+	di->di_gid = inode->i_gid;
+	di->di_atime = inode->i_atime.tv_sec;
+	di->di_mtime = inode->i_mtime.tv_sec;
+	di->di_ctime = inode->i_ctime.tv_sec;
+}
+
+static int iget_test(struct inode *inode, void *opaque)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_inum *inum = opaque;
+
+	if (ip && ip->i_num.no_addr == inum->no_addr)
+		return 1;
+
+	return 0;
+}
+
+static int iget_set(struct inode *inode, void *opaque)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_inum *inum = opaque;
+
+	ip->i_num = *inum;
+	return 0;
+}
+
+struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum)
+{
+	return ilookup5(sb, (unsigned long)inum->no_formal_ino,
+			iget_test, inum);
+}
+
+static struct inode *gfs2_iget(struct super_block *sb, struct gfs2_inum *inum)
+{
+	return iget5_locked(sb, (unsigned long)inum->no_formal_ino,
+		     iget_test, iget_set, inum);
+}
+
+/**
+ * gfs2_inode_lookup - Lookup an inode
+ * @sb: The super block
+ * @inum: The inode number
+ * @type: The type of the inode
+ *
+ * Returns: A VFS inode, or an error
+ */
+
+struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned int type)
+{
+	struct inode *inode = gfs2_iget(sb, inum);
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_glock *io_gl;
+	int error;
+
+	if (inode->i_state & I_NEW) {
+		struct gfs2_sbd *sdp = GFS2_SB(inode);
+		umode_t mode = DT2IF(type);
+		inode->i_private = ip;
+		inode->i_mode = mode;
+
+		if (S_ISREG(mode)) {
+			inode->i_op = &gfs2_file_iops;
+			inode->i_fop = &gfs2_file_fops;
+			inode->i_mapping->a_ops = &gfs2_file_aops;
+		} else if (S_ISDIR(mode)) {
+			inode->i_op = &gfs2_dir_iops;
+			inode->i_fop = &gfs2_dir_fops;
+		} else if (S_ISLNK(mode)) {
+			inode->i_op = &gfs2_symlink_iops;
+		} else {
+			inode->i_op = &gfs2_dev_iops;
+		}
+
+		error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
+		if (unlikely(error))
+			goto fail;
+		ip->i_gl->gl_object = ip;
+
+		error = gfs2_glock_get(sdp, inum->no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
+		if (unlikely(error))
+			goto fail_put;
+
+		ip->i_vn = ip->i_gl->gl_vn - 1;
+		error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh);
+		if (unlikely(error))
+			goto fail_iopen;
+
+		gfs2_glock_put(io_gl);
+		unlock_new_inode(inode);
+	}
+
+	return inode;
+fail_iopen:
+	gfs2_glock_put(io_gl);
+fail_put:
+	ip->i_gl->gl_object = NULL;
+	gfs2_glock_put(ip->i_gl);
+fail:
+	iput(inode);
+	return ERR_PTR(error);
+}
+
+/**
+ * gfs2_inode_refresh - Refresh the incore copy of the dinode
+ * @ip: The GFS2 inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_inode_refresh(struct gfs2_inode *ip)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
+		brelse(dibh);
+		return -EIO;
+	}
+
+	gfs2_dinode_in(&ip->i_di, dibh->b_data);
+
+	brelse(dibh);
+
+	if (ip->i_num.no_addr != ip->i_di.di_num.no_addr) {
+		if (gfs2_consist_inode(ip))
+			gfs2_dinode_print(&ip->i_di);
+		return -EIO;
+	}
+	if (ip->i_num.no_formal_ino != ip->i_di.di_num.no_formal_ino)
+		return -ESTALE;
+
+	ip->i_vn = ip->i_gl->gl_vn;
+
+	return 0;
+}
+
+int gfs2_dinode_dealloc(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al;
+	struct gfs2_rgrpd *rgd;
+	int error;
+
+	if (ip->i_di.di_blocks != 1) {
+		if (gfs2_consist_inode(ip))
+			gfs2_dinode_print(&ip->i_di);
+		return -EIO;
+	}
+
+	al = gfs2_alloc_get(ip);
+
+	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out;
+
+	error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+	if (error)
+		goto out_qs;
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		error = -EIO;
+		goto out_rindex_relse;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+				   &al->al_rgd_gh);
+	if (error)
+		goto out_rindex_relse;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
+	if (error)
+		goto out_rg_gunlock;
+
+	gfs2_trans_add_gl(ip->i_gl);
+
+	gfs2_free_di(rgd, ip);
+
+	gfs2_trans_end(sdp);
+	clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
+
+out_rg_gunlock:
+	gfs2_glock_dq_uninit(&al->al_rgd_gh);
+out_rindex_relse:
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_qs:
+	gfs2_quota_unhold(ip);
+out:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+/**
+ * gfs2_change_nlink - Change nlink count on inode
+ * @ip: The GFS2 inode
+ * @diff: The change in the nlink count required
+ *
+ * Returns: errno
+ */
+
+int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
+{
+	struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
+	struct buffer_head *dibh;
+	u32 nlink;
+	int error;
+
+	BUG_ON(ip->i_di.di_nlink != ip->i_inode.i_nlink);
+	nlink = ip->i_di.di_nlink + diff;
+
+	/* If we are reducing the nlink count, but the new value ends up being
+	   bigger than the old one, we must have underflowed. */
+	if (diff < 0 && nlink > ip->i_di.di_nlink) {
+		if (gfs2_consist_inode(ip))
+			gfs2_dinode_print(&ip->i_di);
+		return -EIO;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	ip->i_di.di_nlink = nlink;
+	ip->i_di.di_ctime = get_seconds();
+	ip->i_inode.i_nlink = nlink;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	brelse(dibh);
+	mark_inode_dirty(&ip->i_inode);
+
+	if (ip->i_di.di_nlink == 0) {
+		struct gfs2_rgrpd *rgd;
+		struct gfs2_holder ri_gh, rg_gh;
+
+		error = gfs2_rindex_hold(sdp, &ri_gh);
+		if (error)
+			goto out;
+		error = -EIO;
+		rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
+		if (!rgd)
+			goto out_norgrp;
+		error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
+		if (error)
+			goto out_norgrp;
+
+		clear_nlink(&ip->i_inode);
+		gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
+		gfs2_glock_dq_uninit(&rg_gh);
+out_norgrp:
+		gfs2_glock_dq_uninit(&ri_gh);
+	}
+out:
+	return error;
+}
+
+struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
+{
+	struct qstr qstr;
+	gfs2_str2qstr(&qstr, name);
+	return gfs2_lookupi(dip, &qstr, 1, NULL);
+}
+
+
+/**
+ * gfs2_lookupi - Look up a filename in a directory and return its inode
+ * @d_gh: An initialized holder for the directory glock
+ * @name: The name of the inode to look for
+ * @is_root: If 1, ignore the caller's permissions
+ * @i_gh: An uninitialized holder for the new inode glock
+ *
+ * There will always be a vnode (Linux VFS inode) for the d_gh inode unless
+ * @is_root is true.
+ *
+ * Returns: errno
+ */
+
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+			   int is_root, struct nameidata *nd)
+{
+	struct super_block *sb = dir->i_sb;
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_holder d_gh;
+	struct gfs2_inum inum;
+	unsigned int type;
+	int error = 0;
+	struct inode *inode = NULL;
+
+	if (!name->len || name->len > GFS2_FNAMESIZE)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) ||
+	    (name->len == 2 && memcmp(name->name, "..", 2) == 0 &&
+	     dir == sb->s_root->d_inode)) {
+		igrab(dir);
+		return dir;
+	}
+
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+	if (error)
+		return ERR_PTR(error);
+
+	if (!is_root) {
+		error = permission(dir, MAY_EXEC, NULL);
+		if (error)
+			goto out;
+	}
+
+	error = gfs2_dir_search(dir, name, &inum, &type);
+	if (error)
+		goto out;
+
+	inode = gfs2_inode_lookup(sb, &inum, type);
+
+out:
+	gfs2_glock_dq_uninit(&d_gh);
+	if (error == -ENOENT)
+		return NULL;
+	return inode;
+}
+
+static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
+	struct buffer_head *bh;
+	struct gfs2_inum_range ir;
+	int error;
+
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		return error;
+	mutex_lock(&sdp->sd_inum_mutex);
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error) {
+		mutex_unlock(&sdp->sd_inum_mutex);
+		gfs2_trans_end(sdp);
+		return error;
+	}
+
+	gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
+
+	if (ir.ir_length) {
+		*formal_ino = ir.ir_start++;
+		ir.ir_length--;
+		gfs2_trans_add_bh(ip->i_gl, bh, 1);
+		gfs2_inum_range_out(&ir,
+				    bh->b_data + sizeof(struct gfs2_dinode));
+		brelse(bh);
+		mutex_unlock(&sdp->sd_inum_mutex);
+		gfs2_trans_end(sdp);
+		return 0;
+	}
+
+	brelse(bh);
+
+	mutex_unlock(&sdp->sd_inum_mutex);
+	gfs2_trans_end(sdp);
+
+	return 1;
+}
+
+static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
+	struct gfs2_holder gh;
+	struct buffer_head *bh;
+	struct gfs2_inum_range ir;
+	int error;
+
+	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
+	if (error)
+		goto out;
+	mutex_lock(&sdp->sd_inum_mutex);
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		goto out_end_trans;
+
+	gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
+
+	if (!ir.ir_length) {
+		struct buffer_head *m_bh;
+		u64 x, y;
+
+		error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+		if (error)
+			goto out_brelse;
+
+		x = *(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
+		x = y = be64_to_cpu(x);
+		ir.ir_start = x;
+		ir.ir_length = GFS2_INUM_QUANTUM;
+		x += GFS2_INUM_QUANTUM;
+		if (x < y)
+			gfs2_consist_inode(m_ip);
+		x = cpu_to_be64(x);
+		gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+		*(u64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = x;
+
+		brelse(m_bh);
+	}
+
+	*formal_ino = ir.ir_start++;
+	ir.ir_length--;
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
+
+out_brelse:
+	brelse(bh);
+out_end_trans:
+	mutex_unlock(&sdp->sd_inum_mutex);
+	gfs2_trans_end(sdp);
+out:
+	gfs2_glock_dq_uninit(&gh);
+	return error;
+}
+
+static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
+{
+	int error;
+
+	error = pick_formal_ino_1(sdp, inum);
+	if (error <= 0)
+		return error;
+
+	error = pick_formal_ino_2(sdp, inum);
+
+	return error;
+}
+
+/**
+ * create_ok - OK to create a new on-disk inode here?
+ * @dip:  Directory in which dinode is to be created
+ * @name:  Name of new dinode
+ * @mode:
+ *
+ * Returns: errno
+ */
+
+static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
+		     unsigned int mode)
+{
+	int error;
+
+	error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+	if (error)
+		return error;
+
+	/*  Don't create entries in an unlinked directory  */
+	if (!dip->i_di.di_nlink)
+		return -EPERM;
+
+	error = gfs2_dir_search(&dip->i_inode, name, NULL, NULL);
+	switch (error) {
+	case -ENOENT:
+		error = 0;
+		break;
+	case 0:
+		return -EEXIST;
+	default:
+		return error;
+	}
+
+	if (dip->i_di.di_entries == (u32)-1)
+		return -EFBIG;
+	if (S_ISDIR(mode) && dip->i_di.di_nlink == (u32)-1)
+		return -EMLINK;
+
+	return 0;
+}
+
+static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
+			       unsigned int *uid, unsigned int *gid)
+{
+	if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
+	    (dip->i_di.di_mode & S_ISUID) && dip->i_di.di_uid) {
+		if (S_ISDIR(*mode))
+			*mode |= S_ISUID;
+		else if (dip->i_di.di_uid != current->fsuid)
+			*mode &= ~07111;
+		*uid = dip->i_di.di_uid;
+	} else
+		*uid = current->fsuid;
+
+	if (dip->i_di.di_mode & S_ISGID) {
+		if (S_ISDIR(*mode))
+			*mode |= S_ISGID;
+		*gid = dip->i_di.di_gid;
+	} else
+		*gid = current->fsgid;
+}
+
+static int alloc_dinode(struct gfs2_inode *dip, struct gfs2_inum *inum,
+			u64 *generation)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	int error;
+
+	gfs2_alloc_get(dip);
+
+	dip->i_alloc.al_requested = RES_DINODE;
+	error = gfs2_inplace_reserve(dip);
+	if (error)
+		goto out;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS, 0);
+	if (error)
+		goto out_ipreserv;
+
+	inum->no_addr = gfs2_alloc_di(dip, generation);
+
+	gfs2_trans_end(sdp);
+
+out_ipreserv:
+	gfs2_inplace_release(dip);
+out:
+	gfs2_alloc_put(dip);
+	return error;
+}
+
+/**
+ * init_dinode - Fill in a new dinode structure
+ * @dip: the directory this inode is being created in
+ * @gl: The glock covering the new inode
+ * @inum: the inode number
+ * @mode: the file permissions
+ * @uid:
+ * @gid:
+ *
+ */
+
+static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
+			const struct gfs2_inum *inum, unsigned int mode,
+			unsigned int uid, unsigned int gid,
+			const u64 *generation)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_dinode *di;
+	struct buffer_head *dibh;
+
+	dibh = gfs2_meta_new(gl, inum->no_addr);
+	gfs2_trans_add_bh(gl, dibh, 1);
+	gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI);
+	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
+	di = (struct gfs2_dinode *)dibh->b_data;
+
+	di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino);
+	di->di_num.no_addr = cpu_to_be64(inum->no_addr);
+	di->di_mode = cpu_to_be32(mode);
+	di->di_uid = cpu_to_be32(uid);
+	di->di_gid = cpu_to_be32(gid);
+	di->di_nlink = cpu_to_be32(0);
+	di->di_size = cpu_to_be64(0);
+	di->di_blocks = cpu_to_be64(1);
+	di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(get_seconds());
+	di->di_major = di->di_minor = cpu_to_be32(0);
+	di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
+	di->di_generation = cpu_to_be64(*generation);
+	di->di_flags = cpu_to_be32(0);
+
+	if (S_ISREG(mode)) {
+		if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
+		    gfs2_tune_get(sdp, gt_new_files_jdata))
+			di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
+		if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
+		    gfs2_tune_get(sdp, gt_new_files_directio))
+			di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
+	} else if (S_ISDIR(mode)) {
+		di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+					    GFS2_DIF_INHERIT_DIRECTIO);
+		di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+					    GFS2_DIF_INHERIT_JDATA);
+	}
+
+	di->__pad1 = 0;
+	di->di_payload_format = cpu_to_be32(0);
+	di->di_height = cpu_to_be32(0);
+	di->__pad2 = 0;
+	di->__pad3 = 0;
+	di->di_depth = cpu_to_be16(0);
+	di->di_entries = cpu_to_be32(0);
+	memset(&di->__pad4, 0, sizeof(di->__pad4));
+	di->di_eattr = cpu_to_be64(0);
+	memset(&di->di_reserved, 0, sizeof(di->di_reserved));
+
+	brelse(dibh);
+}
+
+static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
+		       unsigned int mode, const struct gfs2_inum *inum,
+		       const u64 *generation)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	unsigned int uid, gid;
+	int error;
+
+	munge_mode_uid_gid(dip, &mode, &uid, &gid);
+	gfs2_alloc_get(dip);
+
+	error = gfs2_quota_lock(dip, uid, gid);
+	if (error)
+		goto out;
+
+	error = gfs2_quota_check(dip, uid, gid);
+	if (error)
+		goto out_quota;
+
+	error = gfs2_trans_begin(sdp, RES_DINODE + RES_QUOTA, 0);
+	if (error)
+		goto out_quota;
+
+	init_dinode(dip, gl, inum, mode, uid, gid, generation);
+	gfs2_quota_change(dip, +1, uid, gid);
+	gfs2_trans_end(sdp);
+
+out_quota:
+	gfs2_quota_unlock(dip);
+out:
+	gfs2_alloc_put(dip);
+	return error;
+}
+
+static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
+		       struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_alloc *al;
+	int alloc_required;
+	struct buffer_head *dibh;
+	int error;
+
+	al = gfs2_alloc_get(dip);
+
+	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto fail;
+
+	error = alloc_required = gfs2_diradd_alloc_required(&dip->i_inode, name);
+	if (alloc_required < 0)
+		goto fail;
+	if (alloc_required) {
+		error = gfs2_quota_check(dip, dip->i_di.di_uid,
+					 dip->i_di.di_gid);
+		if (error)
+			goto fail_quota_locks;
+
+		al->al_requested = sdp->sd_max_dirres;
+
+		error = gfs2_inplace_reserve(dip);
+		if (error)
+			goto fail_quota_locks;
+
+		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+					 al->al_rgd->rd_ri.ri_length +
+					 2 * RES_DINODE +
+					 RES_STATFS + RES_QUOTA, 0);
+		if (error)
+			goto fail_ipreserv;
+	} else {
+		error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0);
+		if (error)
+			goto fail_quota_locks;
+	}
+
+	error = gfs2_dir_add(&dip->i_inode, name, &ip->i_num, IF2DT(ip->i_di.di_mode));
+	if (error)
+		goto fail_end_trans;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto fail_end_trans;
+	ip->i_di.di_nlink = 1;
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	brelse(dibh);
+	return 0;
+
+fail_end_trans:
+	gfs2_trans_end(sdp);
+
+fail_ipreserv:
+	if (dip->i_alloc.al_rgd)
+		gfs2_inplace_release(dip);
+
+fail_quota_locks:
+	gfs2_quota_unlock(dip);
+
+fail:
+	gfs2_alloc_put(dip);
+	return error;
+}
+
+static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
+{
+	int err;
+	size_t len;
+	void *value;
+	char *name;
+	struct gfs2_ea_request er;
+
+	err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
+					   &name, &value, &len);
+
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+
+	er.er_type = GFS2_EATYPE_SECURITY;
+	er.er_name = name;
+	er.er_data = value;
+	er.er_name_len = strlen(name);
+	er.er_data_len = len;
+
+	err = gfs2_ea_set_i(ip, &er);
+
+	kfree(value);
+	kfree(name);
+
+	return err;
+}
+
+/**
+ * gfs2_createi - Create a new inode
+ * @ghs: An array of two holders
+ * @name: The name of the new file
+ * @mode: the permissions on the new inode
+ *
+ * @ghs[0] is an initialized holder for the directory
+ * @ghs[1] is the holder for the inode lock
+ *
+ * If the return value is not NULL, the glocks on both the directory and the new
+ * file are held.  A transaction has been started and an inplace reservation
+ * is held, as well.
+ *
+ * Returns: An inode
+ */
+
+struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
+			   unsigned int mode)
+{
+	struct inode *inode;
+	struct gfs2_inode *dip = ghs->gh_gl->gl_object;
+	struct inode *dir = &dip->i_inode;
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_inum inum;
+	int error;
+	u64 generation;
+
+	if (!name->len || name->len > GFS2_FNAMESIZE)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
+	error = gfs2_glock_nq(ghs);
+	if (error)
+		goto fail;
+
+	error = create_ok(dip, name, mode);
+	if (error)
+		goto fail_gunlock;
+
+	error = pick_formal_ino(sdp, &inum.no_formal_ino);
+	if (error)
+		goto fail_gunlock;
+
+	error = alloc_dinode(dip, &inum, &generation);
+	if (error)
+		goto fail_gunlock;
+
+	if (inum.no_addr < dip->i_num.no_addr) {
+		gfs2_glock_dq(ghs);
+
+		error = gfs2_glock_nq_num(sdp, inum.no_addr,
+					  &gfs2_inode_glops, LM_ST_EXCLUSIVE,
+					  GL_SKIP, ghs + 1);
+		if (error) {
+			return ERR_PTR(error);
+		}
+
+		gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
+		error = gfs2_glock_nq(ghs);
+		if (error) {
+			gfs2_glock_dq_uninit(ghs + 1);
+			return ERR_PTR(error);
+		}
+
+		error = create_ok(dip, name, mode);
+		if (error)
+			goto fail_gunlock2;
+	} else {
+		error = gfs2_glock_nq_num(sdp, inum.no_addr,
+					  &gfs2_inode_glops, LM_ST_EXCLUSIVE,
+					  GL_SKIP, ghs + 1);
+		if (error)
+			goto fail_gunlock;
+	}
+
+	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation);
+	if (error)
+		goto fail_gunlock2;
+
+	inode = gfs2_inode_lookup(dir->i_sb, &inum, IF2DT(mode));
+	if (IS_ERR(inode))
+		goto fail_gunlock2;
+
+	error = gfs2_inode_refresh(GFS2_I(inode));
+	if (error)
+		goto fail_iput;
+
+	error = gfs2_acl_create(dip, GFS2_I(inode));
+	if (error)
+		goto fail_iput;
+
+	error = gfs2_security_init(dip, GFS2_I(inode));
+	if (error)
+		goto fail_iput;
+
+	error = link_dinode(dip, name, GFS2_I(inode));
+	if (error)
+		goto fail_iput;
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	return inode;
+
+fail_iput:
+	iput(inode);
+fail_gunlock2:
+	gfs2_glock_dq_uninit(ghs + 1);
+fail_gunlock:
+	gfs2_glock_dq(ghs);
+fail:
+	return ERR_PTR(error);
+}
+
+/**
+ * gfs2_rmdiri - Remove a directory
+ * @dip: The parent directory of the directory to be removed
+ * @name: The name of the directory to be removed
+ * @ip: The GFS2 inode of the directory to be removed
+ *
+ * Assumes Glocks on dip and ip are held
+ *
+ * Returns: errno
+ */
+
+int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+		struct gfs2_inode *ip)
+{
+	struct qstr dotname;
+	int error;
+
+	if (ip->i_di.di_entries != 2) {
+		if (gfs2_consist_inode(ip))
+			gfs2_dinode_print(&ip->i_di);
+		return -EIO;
+	}
+
+	error = gfs2_dir_del(dip, name);
+	if (error)
+		return error;
+
+	error = gfs2_change_nlink(dip, -1);
+	if (error)
+		return error;
+
+	gfs2_str2qstr(&dotname, ".");
+	error = gfs2_dir_del(ip, &dotname);
+	if (error)
+		return error;
+
+	gfs2_str2qstr(&dotname, "..");
+	error = gfs2_dir_del(ip, &dotname);
+	if (error)
+		return error;
+
+	error = gfs2_change_nlink(ip, -2);
+	if (error)
+		return error;
+
+	return error;
+}
+
+/*
+ * gfs2_unlink_ok - check to see that a inode is still in a directory
+ * @dip: the directory
+ * @name: the name of the file
+ * @ip: the inode
+ *
+ * Assumes that the lock on (at least) @dip is held.
+ *
+ * Returns: 0 if the parent/child relationship is correct, errno if it isn't
+ */
+
+int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+		   struct gfs2_inode *ip)
+{
+	struct gfs2_inum inum;
+	unsigned int type;
+	int error;
+
+	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+		return -EPERM;
+
+	if ((dip->i_di.di_mode & S_ISVTX) &&
+	    dip->i_di.di_uid != current->fsuid &&
+	    ip->i_di.di_uid != current->fsuid && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (IS_APPEND(&dip->i_inode))
+		return -EPERM;
+
+	error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+	if (error)
+		return error;
+
+	error = gfs2_dir_search(&dip->i_inode, name, &inum, &type);
+	if (error)
+		return error;
+
+	if (!gfs2_inum_equal(&inum, &ip->i_num))
+		return -ENOENT;
+
+	if (IF2DT(ip->i_di.di_mode) != type) {
+		gfs2_consist_inode(dip);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Follow @to back to the root and make sure we don't encounter @this
+ * Assumes we already hold the rename lock.
+ *
+ * Returns: errno
+ */
+
+int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+	struct inode *dir = &to->i_inode;
+	struct super_block *sb = dir->i_sb;
+	struct inode *tmp;
+	struct qstr dotdot;
+	int error = 0;
+
+	gfs2_str2qstr(&dotdot, "..");
+
+	igrab(dir);
+
+	for (;;) {
+		if (dir == &this->i_inode) {
+			error = -EINVAL;
+			break;
+		}
+		if (dir == sb->s_root->d_inode) {
+			error = 0;
+			break;
+		}
+
+		tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
+		if (IS_ERR(tmp)) {
+			error = PTR_ERR(tmp);
+			break;
+		}
+
+		iput(dir);
+		dir = tmp;
+	}
+
+	iput(dir);
+
+	return error;
+}
+
+/**
+ * gfs2_readlinki - return the contents of a symlink
+ * @ip: the symlink's inode
+ * @buf: a pointer to the buffer to be filled
+ * @len: a pointer to the length of @buf
+ *
+ * If @buf is too small, a piece of memory is kmalloc()ed and needs
+ * to be freed by the caller.
+ *
+ * Returns: errno
+ */
+
+int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
+{
+	struct gfs2_holder i_gh;
+	struct buffer_head *dibh;
+	unsigned int x;
+	int error;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
+	error = gfs2_glock_nq_atime(&i_gh);
+	if (error) {
+		gfs2_holder_uninit(&i_gh);
+		return error;
+	}
+
+	if (!ip->i_di.di_size) {
+		gfs2_consist_inode(ip);
+		error = -EIO;
+		goto out;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	x = ip->i_di.di_size + 1;
+	if (x > *len) {
+		*buf = kmalloc(x, GFP_KERNEL);
+		if (!*buf) {
+			error = -ENOMEM;
+			goto out_brelse;
+		}
+	}
+
+	memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
+	*len = x;
+
+out_brelse:
+	brelse(dibh);
+out:
+	gfs2_glock_dq_uninit(&i_gh);
+	return error;
+}
+
+/**
+ * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
+ *       conditionally update the inode's atime
+ * @gh: the holder to acquire
+ *
+ * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
+ * Update if the difference between the current time and the inode's current
+ * atime is greater than an interval specified at mount.
+ *
+ * Returns: errno
+ */
+
+int gfs2_glock_nq_atime(struct gfs2_holder *gh)
+{
+	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_inode *ip = gl->gl_object;
+	s64 curtime, quantum = gfs2_tune_get(sdp, gt_atime_quantum);
+	unsigned int state;
+	int flags;
+	int error;
+
+	if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
+	    gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
+	    gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
+		return -EINVAL;
+
+	state = gh->gh_state;
+	flags = gh->gh_flags;
+
+	error = gfs2_glock_nq(gh);
+	if (error)
+		return error;
+
+	if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
+	    (sdp->sd_vfs->s_flags & MS_RDONLY))
+		return 0;
+
+	curtime = get_seconds();
+	if (curtime - ip->i_di.di_atime >= quantum) {
+		gfs2_glock_dq(gh);
+		gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
+				   gh);
+		error = gfs2_glock_nq(gh);
+		if (error)
+			return error;
+
+		/* Verify that atime hasn't been updated while we were
+		   trying to get exclusive lock. */
+
+		curtime = get_seconds();
+		if (curtime - ip->i_di.di_atime >= quantum) {
+			struct buffer_head *dibh;
+			struct gfs2_dinode *di;
+
+			error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+			if (error == -EROFS)
+				return 0;
+			if (error)
+				goto fail;
+
+			error = gfs2_meta_inode_buffer(ip, &dibh);
+			if (error)
+				goto fail_end_trans;
+
+			ip->i_di.di_atime = curtime;
+
+			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+			di = (struct gfs2_dinode *)dibh->b_data;
+			di->di_atime = cpu_to_be64(ip->i_di.di_atime);
+			brelse(dibh);
+
+			gfs2_trans_end(sdp);
+		}
+
+		/* If someone else has asked for the glock,
+		   unlock and let them have it. Then reacquire
+		   in the original state. */
+		if (gfs2_glock_is_blocking(gl)) {
+			gfs2_glock_dq(gh);
+			gfs2_holder_reinit(state, flags, gh);
+			return gfs2_glock_nq(gh);
+		}
+	}
+
+	return 0;
+
+fail_end_trans:
+	gfs2_trans_end(sdp);
+fail:
+	gfs2_glock_dq(gh);
+	return error;
+}
+
+/**
+ * glock_compare_atime - Compare two struct gfs2_glock structures for sort
+ * @arg_a: the first structure
+ * @arg_b: the second structure
+ *
+ * Returns: 1 if A > B
+ *         -1 if A < B
+ *          0 if A == B
+ */
+
+static int glock_compare_atime(const void *arg_a, const void *arg_b)
+{
+	const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a;
+	const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b;
+	const struct lm_lockname *a = &gh_a->gh_gl->gl_name;
+	const struct lm_lockname *b = &gh_b->gh_gl->gl_name;
+
+	if (a->ln_number > b->ln_number)
+		return 1;
+	if (a->ln_number < b->ln_number)
+		return -1;
+	if (gh_a->gh_state == LM_ST_SHARED && gh_b->gh_state == LM_ST_EXCLUSIVE)
+		return 1;
+	if (gh_a->gh_state == LM_ST_SHARED && (gh_b->gh_flags & GL_ATIME))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * gfs2_glock_nq_m_atime - acquire multiple glocks where one may need an
+ *      atime update
+ * @num_gh: the number of structures
+ * @ghs: an array of struct gfs2_holder structures
+ *
+ * Returns: 0 on success (all glocks acquired),
+ *          errno on failure (no glocks acquired)
+ */
+
+int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs)
+{
+	struct gfs2_holder **p;
+	unsigned int x;
+	int error = 0;
+
+	if (!num_gh)
+		return 0;
+
+	if (num_gh == 1) {
+		ghs->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+		if (ghs->gh_flags & GL_ATIME)
+			error = gfs2_glock_nq_atime(ghs);
+		else
+			error = gfs2_glock_nq(ghs);
+		return error;
+	}
+
+	p = kcalloc(num_gh, sizeof(struct gfs2_holder *), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	for (x = 0; x < num_gh; x++)
+		p[x] = &ghs[x];
+
+	sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare_atime,NULL);
+
+	for (x = 0; x < num_gh; x++) {
+		p[x]->gh_flags &= ~(LM_FLAG_TRY | GL_ASYNC);
+
+		if (p[x]->gh_flags & GL_ATIME)
+			error = gfs2_glock_nq_atime(p[x]);
+		else
+			error = gfs2_glock_nq(p[x]);
+
+		if (error) {
+			while (x--)
+				gfs2_glock_dq(p[x]);
+			break;
+		}
+	}
+
+	kfree(p);
+	return error;
+}
+
+
+static int
+__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+{
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		error = inode_setattr(&ip->i_inode, attr);
+		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+		gfs2_inode_attr_out(ip);
+
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+	return error;
+}
+
+/**
+ * gfs2_setattr_simple -
+ * @ip:
+ * @attr:
+ *
+ * Called with a reference on the vnode.
+ *
+ * Returns: errno
+ */
+
+int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+{
+	int error;
+
+	if (current->journal_info)
+		return __gfs2_setattr_simple(ip, attr);
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE, 0);
+	if (error)
+		return error;
+
+	error = __gfs2_setattr_simple(ip, attr);
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+	return error;
+}
+
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
new file mode 100644
index 000000000000..f5d861760579
--- /dev/null
+++ b/fs/gfs2/inode.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __INODE_DOT_H__
+#define __INODE_DOT_H__
+
+static inline int gfs2_is_stuffed(struct gfs2_inode *ip)
+{
+	return !ip->i_di.di_height;
+}
+
+static inline int gfs2_is_jdata(struct gfs2_inode *ip)
+{
+	return ip->i_di.di_flags & GFS2_DIF_JDATA;
+}
+
+static inline int gfs2_is_dir(struct gfs2_inode *ip)
+{
+	return S_ISDIR(ip->i_di.di_mode);
+}
+
+void gfs2_inode_attr_in(struct gfs2_inode *ip);
+void gfs2_inode_attr_out(struct gfs2_inode *ip);
+struct inode *gfs2_inode_lookup(struct super_block *sb, struct gfs2_inum *inum, unsigned type);
+struct inode *gfs2_ilookup(struct super_block *sb, struct gfs2_inum *inum);
+
+int gfs2_inode_refresh(struct gfs2_inode *ip);
+
+int gfs2_dinode_dealloc(struct gfs2_inode *inode);
+int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
+struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+			   int is_root, struct nameidata *nd);
+struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
+			   unsigned int mode);
+int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+		struct gfs2_inode *ip);
+int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+		   struct gfs2_inode *ip);
+int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
+int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
+
+int gfs2_glock_nq_atime(struct gfs2_holder *gh);
+int gfs2_glock_nq_m_atime(unsigned int num_gh, struct gfs2_holder *ghs);
+
+int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
+
+struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
+
+#endif /* __INODE_DOT_H__ */
+
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
new file mode 100644
index 000000000000..effe4a337c1d
--- /dev/null
+++ b/fs/gfs2/lm.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "lm.h"
+#include "super.h"
+#include "util.h"
+
+/**
+ * gfs2_lm_mount - mount a locking protocol
+ * @sdp: the filesystem
+ * @args: mount arguements
+ * @silent: if 1, don't complain if the FS isn't a GFS2 fs
+ *
+ * Returns: errno
+ */
+
+int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
+{
+	char *proto = sdp->sd_proto_name;
+	char *table = sdp->sd_table_name;
+	int flags = 0;
+	int error;
+
+	if (sdp->sd_args.ar_spectator)
+		flags |= LM_MFLAG_SPECTATOR;
+
+	fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
+
+	error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
+				     gfs2_glock_cb, sdp,
+				     GFS2_MIN_LVB_SIZE, flags,
+				     &sdp->sd_lockstruct, &sdp->sd_kobj);
+	if (error) {
+		fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
+			proto, table, sdp->sd_args.ar_hostdata);
+		goto out;
+	}
+
+	if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
+	    gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+	    gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
+				  GFS2_MIN_LVB_SIZE)) {
+		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+		goto out;
+	}
+
+	if (sdp->sd_args.ar_spectator)
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
+	else
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
+			 sdp->sd_lockstruct.ls_jid);
+
+	fs_info(sdp, "Joined cluster. Now mounting FS...\n");
+
+	if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
+	    !sdp->sd_args.ar_ignore_local_fs) {
+		sdp->sd_args.ar_localflocks = 1;
+		sdp->sd_args.ar_localcaching = 1;
+	}
+
+out:
+	return error;
+}
+
+void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
+{
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
+					sdp->sd_lockstruct.ls_lockspace);
+}
+
+void gfs2_lm_unmount(struct gfs2_sbd *sdp)
+{
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+}
+
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+{
+	va_list args;
+
+	if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return 0;
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+
+	fs_err(sdp, "about to withdraw from the cluster\n");
+	BUG_ON(sdp->sd_args.ar_debug);
+
+
+	fs_err(sdp, "waiting for outstanding I/O\n");
+
+	/* FIXME: suspend dm device so oustanding bio's complete
+	   and all further io requests fail */
+
+	fs_err(sdp, "telling LM to withdraw\n");
+	gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
+	fs_err(sdp, "withdrawn\n");
+	dump_stack();
+
+	return -1;
+}
+
+int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		     void **lockp)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
+				sdp->sd_lockstruct.ls_lockspace, name, lockp);
+	return error;
+}
+
+void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
+{
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
+}
+
+unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+			  unsigned int cur_state, unsigned int req_state,
+			  unsigned int flags)
+{
+	int ret = 0;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+							 req_state, flags);
+	return ret;
+}
+
+unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
+			    unsigned int cur_state)
+{
+	int ret = 0;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		ret =  sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
+	return ret;
+}
+
+void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
+{
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
+}
+
+int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
+	return error;
+}
+
+void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
+{
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
+}
+
+int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		      struct file *file, struct file_lock *fl)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
+				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+	return error;
+}
+
+int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		  struct file *file, int cmd, struct file_lock *fl)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_plock(
+				sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
+	return error;
+}
+
+int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		    struct file *file, struct file_lock *fl)
+{
+	int error = -EIO;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = sdp->sd_lockstruct.ls_ops->lm_punlock(
+				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
+	return error;
+}
+
+void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+			   unsigned int message)
+{
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		sdp->sd_lockstruct.ls_ops->lm_recovery_done(
+			sdp->sd_lockstruct.ls_lockspace, jid, message);
+}
+
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
new file mode 100644
index 000000000000..21cdc30ee08c
--- /dev/null
+++ b/fs/gfs2/lm.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __LM_DOT_H__
+#define __LM_DOT_H__
+
+struct gfs2_sbd;
+
+#define GFS2_MIN_LVB_SIZE 32
+
+int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
+void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
+void gfs2_lm_unmount(struct gfs2_sbd *sdp);
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+				__attribute__ ((format(printf, 2, 3)));
+int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		     void **lockp);
+void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
+unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+			 unsigned int cur_state, unsigned int req_state,
+			 unsigned int flags);
+unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
+			   unsigned int cur_state);
+void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
+int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
+void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
+int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		      struct file *file, struct file_lock *fl);
+int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		  struct file *file, int cmd, struct file_lock *fl);
+int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
+		    struct file *file, struct file_lock *fl);
+void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+			   unsigned int message);
+
+#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
new file mode 100644
index 000000000000..663fee728783
--- /dev/null
+++ b/fs/gfs2/locking.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/kmod.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/lm_interface.h>
+
+struct lmh_wrapper {
+	struct list_head lw_list;
+	const struct lm_lockops *lw_ops;
+};
+
+/* List of registered low-level locking protocols.  A file system selects one
+   of them by name at mount time, e.g. lock_nolock, lock_dlm. */
+
+static LIST_HEAD(lmh_list);
+static DEFINE_MUTEX(lmh_lock);
+
+/**
+ * gfs2_register_lockproto - Register a low-level locking protocol
+ * @proto: the protocol definition
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+
+int gfs2_register_lockproto(const struct lm_lockops *proto)
+{
+	struct lmh_wrapper *lw;
+
+	mutex_lock(&lmh_lock);
+
+	list_for_each_entry(lw, &lmh_list, lw_list) {
+		if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
+			mutex_unlock(&lmh_lock);
+			printk(KERN_INFO "GFS2: protocol %s already exists\n",
+			       proto->lm_proto_name);
+			return -EEXIST;
+		}
+	}
+
+	lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
+	if (!lw) {
+		mutex_unlock(&lmh_lock);
+		return -ENOMEM;
+	}
+
+	lw->lw_ops = proto;
+	list_add(&lw->lw_list, &lmh_list);
+
+	mutex_unlock(&lmh_lock);
+
+	return 0;
+}
+
+/**
+ * gfs2_unregister_lockproto - Unregister a low-level locking protocol
+ * @proto: the protocol definition
+ *
+ */
+
+void gfs2_unregister_lockproto(const struct lm_lockops *proto)
+{
+	struct lmh_wrapper *lw;
+
+	mutex_lock(&lmh_lock);
+
+	list_for_each_entry(lw, &lmh_list, lw_list) {
+		if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
+			list_del(&lw->lw_list);
+			mutex_unlock(&lmh_lock);
+			kfree(lw);
+			return;
+		}
+	}
+
+	mutex_unlock(&lmh_lock);
+
+	printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
+	       proto->lm_proto_name);
+}
+
+/**
+ * gfs2_mount_lockproto - Mount a lock protocol
+ * @proto_name - the name of the protocol
+ * @table_name - the name of the lock space
+ * @host_data - data specific to this host
+ * @cb - the callback to the code using the lock module
+ * @sdp - The GFS2 superblock
+ * @min_lvb_size - the mininum LVB size that the caller can deal with
+ * @flags - LM_MFLAG_*
+ * @lockstruct - a structure returned describing the mount
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+
+int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
+			 lm_callback_t cb, void *cb_data,
+			 unsigned int min_lvb_size, int flags,
+			 struct lm_lockstruct *lockstruct,
+			 struct kobject *fskobj)
+{
+	struct lmh_wrapper *lw = NULL;
+	int try = 0;
+	int error, found;
+
+retry:
+	mutex_lock(&lmh_lock);
+
+	found = 0;
+	list_for_each_entry(lw, &lmh_list, lw_list) {
+		if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		if (!try && capable(CAP_SYS_MODULE)) {
+			try = 1;
+			mutex_unlock(&lmh_lock);
+			request_module(proto_name);
+			goto retry;
+		}
+		printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
+		error = -ENOENT;
+		goto out;
+	}
+
+	if (!try_module_get(lw->lw_ops->lm_owner)) {
+		try = 0;
+		mutex_unlock(&lmh_lock);
+		msleep(1000);
+		goto retry;
+	}
+
+	error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
+				     min_lvb_size, flags, lockstruct, fskobj);
+	if (error)
+		module_put(lw->lw_ops->lm_owner);
+out:
+	mutex_unlock(&lmh_lock);
+	return error;
+}
+
+void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
+{
+	mutex_lock(&lmh_lock);
+	lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
+	if (lockstruct->ls_ops->lm_owner)
+		module_put(lockstruct->ls_ops->lm_owner);
+	mutex_unlock(&lmh_lock);
+}
+
+/**
+ * gfs2_withdraw_lockproto - abnormally unmount a lock module
+ * @lockstruct: the lockstruct passed into mount
+ *
+ */
+
+void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
+{
+	mutex_lock(&lmh_lock);
+	lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
+	if (lockstruct->ls_ops->lm_owner)
+		module_put(lockstruct->ls_ops->lm_owner);
+	mutex_unlock(&lmh_lock);
+}
+
+EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
+EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
+
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
new file mode 100644
index 000000000000..89b93b6b45cf
--- /dev/null
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
+lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o
+
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
new file mode 100644
index 000000000000..b167addf9fd1
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -0,0 +1,524 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include "lock_dlm.h"
+
+static char junk_lvb[GDLM_LVB_SIZE];
+
+static void queue_complete(struct gdlm_lock *lp)
+{
+	struct gdlm_ls *ls = lp->ls;
+
+	clear_bit(LFL_ACTIVE, &lp->flags);
+
+	spin_lock(&ls->async_lock);
+	list_add_tail(&lp->clist, &ls->complete);
+	spin_unlock(&ls->async_lock);
+	wake_up(&ls->thread_wait);
+}
+
+static inline void gdlm_ast(void *astarg)
+{
+	queue_complete(astarg);
+}
+
+static inline void gdlm_bast(void *astarg, int mode)
+{
+	struct gdlm_lock *lp = astarg;
+	struct gdlm_ls *ls = lp->ls;
+
+	if (!mode) {
+		printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
+			lp->lockname.ln_type,
+			(unsigned long long)lp->lockname.ln_number);
+		return;
+	}
+
+	spin_lock(&ls->async_lock);
+	if (!lp->bast_mode) {
+		list_add_tail(&lp->blist, &ls->blocking);
+		lp->bast_mode = mode;
+	} else if (lp->bast_mode < mode)
+		lp->bast_mode = mode;
+	spin_unlock(&ls->async_lock);
+	wake_up(&ls->thread_wait);
+}
+
+void gdlm_queue_delayed(struct gdlm_lock *lp)
+{
+	struct gdlm_ls *ls = lp->ls;
+
+	spin_lock(&ls->async_lock);
+	list_add_tail(&lp->delay_list, &ls->delayed);
+	spin_unlock(&ls->async_lock);
+}
+
+/* convert gfs lock-state to dlm lock-mode */
+
+static s16 make_mode(s16 lmstate)
+{
+	switch (lmstate) {
+	case LM_ST_UNLOCKED:
+		return DLM_LOCK_NL;
+	case LM_ST_EXCLUSIVE:
+		return DLM_LOCK_EX;
+	case LM_ST_DEFERRED:
+		return DLM_LOCK_CW;
+	case LM_ST_SHARED:
+		return DLM_LOCK_PR;
+	}
+	gdlm_assert(0, "unknown LM state %d", lmstate);
+	return -1;
+}
+
+/* convert dlm lock-mode to gfs lock-state */
+
+s16 gdlm_make_lmstate(s16 dlmmode)
+{
+	switch (dlmmode) {
+	case DLM_LOCK_IV:
+	case DLM_LOCK_NL:
+		return LM_ST_UNLOCKED;
+	case DLM_LOCK_EX:
+		return LM_ST_EXCLUSIVE;
+	case DLM_LOCK_CW:
+		return LM_ST_DEFERRED;
+	case DLM_LOCK_PR:
+		return LM_ST_SHARED;
+	}
+	gdlm_assert(0, "unknown DLM mode %d", dlmmode);
+	return -1;
+}
+
+/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
+   DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
+
+static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
+{
+	s16 cur = make_mode(cur_state);
+	if (lp->cur != DLM_LOCK_IV)
+		gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
+}
+
+static inline unsigned int make_flags(struct gdlm_lock *lp,
+				      unsigned int gfs_flags,
+				      s16 cur, s16 req)
+{
+	unsigned int lkf = 0;
+
+	if (gfs_flags & LM_FLAG_TRY)
+		lkf |= DLM_LKF_NOQUEUE;
+
+	if (gfs_flags & LM_FLAG_TRY_1CB) {
+		lkf |= DLM_LKF_NOQUEUE;
+		lkf |= DLM_LKF_NOQUEUEBAST;
+	}
+
+	if (gfs_flags & LM_FLAG_PRIORITY) {
+		lkf |= DLM_LKF_NOORDER;
+		lkf |= DLM_LKF_HEADQUE;
+	}
+
+	if (gfs_flags & LM_FLAG_ANY) {
+		if (req == DLM_LOCK_PR)
+			lkf |= DLM_LKF_ALTCW;
+		else if (req == DLM_LOCK_CW)
+			lkf |= DLM_LKF_ALTPR;
+	}
+
+	if (lp->lksb.sb_lkid != 0) {
+		lkf |= DLM_LKF_CONVERT;
+
+		/* Conversion deadlock avoidance by DLM */
+
+		if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
+		    !(lkf & DLM_LKF_NOQUEUE) &&
+		    cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
+			lkf |= DLM_LKF_CONVDEADLK;
+	}
+
+	if (lp->lvb)
+		lkf |= DLM_LKF_VALBLK;
+
+	return lkf;
+}
+
+/* make_strname - convert GFS lock numbers to a string */
+
+static inline void make_strname(struct lm_lockname *lockname,
+				struct gdlm_strname *str)
+{
+	sprintf(str->name, "%8x%16llx", lockname->ln_type,
+		(unsigned long long)lockname->ln_number);
+	str->namelen = GDLM_STRNAME_BYTES;
+}
+
+static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
+			  struct gdlm_lock **lpp)
+{
+	struct gdlm_lock *lp;
+
+	lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL);
+	if (!lp)
+		return -ENOMEM;
+
+	lp->lockname = *name;
+	lp->ls = ls;
+	lp->cur = DLM_LOCK_IV;
+	lp->lvb = NULL;
+	lp->hold_null = NULL;
+	init_completion(&lp->ast_wait);
+	INIT_LIST_HEAD(&lp->clist);
+	INIT_LIST_HEAD(&lp->blist);
+	INIT_LIST_HEAD(&lp->delay_list);
+
+	spin_lock(&ls->async_lock);
+	list_add(&lp->all_list, &ls->all_locks);
+	ls->all_locks_count++;
+	spin_unlock(&ls->async_lock);
+
+	*lpp = lp;
+	return 0;
+}
+
+void gdlm_delete_lp(struct gdlm_lock *lp)
+{
+	struct gdlm_ls *ls = lp->ls;
+
+	spin_lock(&ls->async_lock);
+	if (!list_empty(&lp->clist))
+		list_del_init(&lp->clist);
+	if (!list_empty(&lp->blist))
+		list_del_init(&lp->blist);
+	if (!list_empty(&lp->delay_list))
+		list_del_init(&lp->delay_list);
+	gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
+		    (unsigned long long)lp->lockname.ln_number);
+	list_del_init(&lp->all_list);
+	ls->all_locks_count--;
+	spin_unlock(&ls->async_lock);
+
+	kfree(lp);
+}
+
+int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
+		  void **lockp)
+{
+	struct gdlm_lock *lp;
+	int error;
+
+	error = gdlm_create_lp(lockspace, name, &lp);
+
+	*lockp = lp;
+	return error;
+}
+
+void gdlm_put_lock(void *lock)
+{
+	gdlm_delete_lp(lock);
+}
+
+unsigned int gdlm_do_lock(struct gdlm_lock *lp)
+{
+	struct gdlm_ls *ls = lp->ls;
+	struct gdlm_strname str;
+	int error, bast = 1;
+
+	/*
+	 * When recovery is in progress, delay lock requests for submission
+	 * once recovery is done.  Requests for recovery (NOEXP) and unlocks
+	 * can pass.
+	 */
+
+	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+	    !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
+		gdlm_queue_delayed(lp);
+		return LM_OUT_ASYNC;
+	}
+
+	/*
+	 * Submit the actual lock request.
+	 */
+
+	if (test_bit(LFL_NOBAST, &lp->flags))
+		bast = 0;
+
+	make_strname(&lp->lockname, &str);
+
+	set_bit(LFL_ACTIVE, &lp->flags);
+
+	log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
+		  (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
+		  lp->cur, lp->req, lp->lkf);
+
+	error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
+			 str.name, str.namelen, 0, gdlm_ast, lp,
+			 bast ? gdlm_bast : NULL);
+
+	if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
+		lp->lksb.sb_status = -EAGAIN;
+		queue_complete(lp);
+		error = 0;
+	}
+
+	if (error) {
+		log_debug("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
+			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
+			  (unsigned long long)lp->lockname.ln_number, error,
+			  lp->cur, lp->req, lp->lkf, lp->flags);
+		return LM_OUT_ERROR;
+	}
+	return LM_OUT_ASYNC;
+}
+
+static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
+{
+	struct gdlm_ls *ls = lp->ls;
+	unsigned int lkf = 0;
+	int error;
+
+	set_bit(LFL_DLM_UNLOCK, &lp->flags);
+	set_bit(LFL_ACTIVE, &lp->flags);
+
+	if (lp->lvb)
+		lkf = DLM_LKF_VALBLK;
+
+	log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
+		  (unsigned long long)lp->lockname.ln_number,
+		  lp->lksb.sb_lkid, lp->cur, lkf);
+
+	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
+
+	if (error) {
+		log_debug("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
+			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
+			  (unsigned long long)lp->lockname.ln_number, error,
+			  lp->cur, lp->req, lp->lkf, lp->flags);
+		return LM_OUT_ERROR;
+	}
+	return LM_OUT_ASYNC;
+}
+
+unsigned int gdlm_lock(void *lock, unsigned int cur_state,
+		       unsigned int req_state, unsigned int flags)
+{
+	struct gdlm_lock *lp = lock;
+
+	clear_bit(LFL_DLM_CANCEL, &lp->flags);
+	if (flags & LM_FLAG_NOEXP)
+		set_bit(LFL_NOBLOCK, &lp->flags);
+
+	check_cur_state(lp, cur_state);
+	lp->req = make_mode(req_state);
+	lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
+
+	return gdlm_do_lock(lp);
+}
+
+unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
+{
+	struct gdlm_lock *lp = lock;
+
+	clear_bit(LFL_DLM_CANCEL, &lp->flags);
+	if (lp->cur == DLM_LOCK_IV)
+		return 0;
+	return gdlm_do_unlock(lp);
+}
+
+void gdlm_cancel(void *lock)
+{
+	struct gdlm_lock *lp = lock;
+	struct gdlm_ls *ls = lp->ls;
+	int error, delay_list = 0;
+
+	if (test_bit(LFL_DLM_CANCEL, &lp->flags))
+		return;
+
+	log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
+		 (unsigned long long)lp->lockname.ln_number, lp->flags);
+
+	spin_lock(&ls->async_lock);
+	if (!list_empty(&lp->delay_list)) {
+		list_del_init(&lp->delay_list);
+		delay_list = 1;
+	}
+	spin_unlock(&ls->async_lock);
+
+	if (delay_list) {
+		set_bit(LFL_CANCEL, &lp->flags);
+		set_bit(LFL_ACTIVE, &lp->flags);
+		queue_complete(lp);
+		return;
+	}
+
+	if (!test_bit(LFL_ACTIVE, &lp->flags) ||
+	    test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+		log_info("gdlm_cancel skip %x,%llx flags %lx",
+		 	 lp->lockname.ln_type,
+			 (unsigned long long)lp->lockname.ln_number, lp->flags);
+		return;
+	}
+
+	/* the lock is blocked in the dlm */
+
+	set_bit(LFL_DLM_CANCEL, &lp->flags);
+	set_bit(LFL_ACTIVE, &lp->flags);
+
+	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
+			   NULL, lp);
+
+	log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
+		 lp->lockname.ln_type,
+		 (unsigned long long)lp->lockname.ln_number, lp->flags);
+
+	if (error == -EBUSY)
+		clear_bit(LFL_DLM_CANCEL, &lp->flags);
+}
+
+static int gdlm_add_lvb(struct gdlm_lock *lp)
+{
+	char *lvb;
+
+	lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL);
+	if (!lvb)
+		return -ENOMEM;
+
+	lp->lksb.sb_lvbptr = lvb;
+	lp->lvb = lvb;
+	return 0;
+}
+
+static void gdlm_del_lvb(struct gdlm_lock *lp)
+{
+	kfree(lp->lvb);
+	lp->lvb = NULL;
+	lp->lksb.sb_lvbptr = NULL;
+}
+
+/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
+   the completion) because gfs won't call hold_lvb() during a callback (from
+   the context of a lock_dlm thread). */
+
+static int hold_null_lock(struct gdlm_lock *lp)
+{
+	struct gdlm_lock *lpn = NULL;
+	int error;
+
+	if (lp->hold_null) {
+		printk(KERN_INFO "lock_dlm: lvb already held\n");
+		return 0;
+	}
+
+	error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
+	if (error)
+		goto out;
+
+	lpn->lksb.sb_lvbptr = junk_lvb;
+	lpn->lvb = junk_lvb;
+
+	lpn->req = DLM_LOCK_NL;
+	lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
+	set_bit(LFL_NOBAST, &lpn->flags);
+	set_bit(LFL_INLOCK, &lpn->flags);
+
+	init_completion(&lpn->ast_wait);
+	gdlm_do_lock(lpn);
+	wait_for_completion(&lpn->ast_wait);
+	error = lpn->lksb.sb_status;
+	if (error) {
+		printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
+		       error);
+		gdlm_delete_lp(lpn);
+		lpn = NULL;
+	}
+out:
+	lp->hold_null = lpn;
+	return error;
+}
+
+/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
+   the completion) because gfs may call unhold_lvb() during a callback (from
+   the context of a lock_dlm thread) which could cause a deadlock since the
+   other lock_dlm thread could be engaged in recovery. */
+
+static void unhold_null_lock(struct gdlm_lock *lp)
+{
+	struct gdlm_lock *lpn = lp->hold_null;
+
+	gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
+		    (unsigned long long)lp->lockname.ln_number);
+	lpn->lksb.sb_lvbptr = NULL;
+	lpn->lvb = NULL;
+	set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
+	gdlm_do_unlock(lpn);
+	lp->hold_null = NULL;
+}
+
+/* Acquire a NL lock because gfs requires the value block to remain
+   intact on the resource while the lvb is "held" even if it's holding no locks
+   on the resource. */
+
+int gdlm_hold_lvb(void *lock, char **lvbp)
+{
+	struct gdlm_lock *lp = lock;
+	int error;
+
+	error = gdlm_add_lvb(lp);
+	if (error)
+		return error;
+
+	*lvbp = lp->lvb;
+
+	error = hold_null_lock(lp);
+	if (error)
+		gdlm_del_lvb(lp);
+
+	return error;
+}
+
+void gdlm_unhold_lvb(void *lock, char *lvb)
+{
+	struct gdlm_lock *lp = lock;
+
+	unhold_null_lock(lp);
+	gdlm_del_lvb(lp);
+}
+
+void gdlm_submit_delayed(struct gdlm_ls *ls)
+{
+	struct gdlm_lock *lp, *safe;
+
+	spin_lock(&ls->async_lock);
+	list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
+		list_del_init(&lp->delay_list);
+		list_add_tail(&lp->delay_list, &ls->submit);
+	}
+	spin_unlock(&ls->async_lock);
+	wake_up(&ls->thread_wait);
+}
+
+int gdlm_release_all_locks(struct gdlm_ls *ls)
+{
+	struct gdlm_lock *lp, *safe;
+	int count = 0;
+
+	spin_lock(&ls->async_lock);
+	list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
+		list_del_init(&lp->all_list);
+
+		if (lp->lvb && lp->lvb != junk_lvb)
+			kfree(lp->lvb);
+		kfree(lp);
+		count++;
+	}
+	spin_unlock(&ls->async_lock);
+
+	return count;
+}
+
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
new file mode 100644
index 000000000000..33af707a4d3f
--- /dev/null
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef LOCK_DLM_DOT_H
+#define LOCK_DLM_DOT_H
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include <linux/socket.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/kobject.h>
+#include <linux/fcntl.h>
+#include <linux/wait.h>
+#include <net/sock.h>
+
+#include <linux/dlm.h>
+#include <linux/lm_interface.h>
+
+/*
+ * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
+ * prefix of lock_dlm_ gets awkward.  Externally, GFS refers to this module
+ * as "lock_dlm".
+ */
+
+#define GDLM_STRNAME_BYTES	24
+#define GDLM_LVB_SIZE		32
+#define GDLM_DROP_COUNT		50000
+#define GDLM_DROP_PERIOD	60
+#define GDLM_NAME_LEN		128
+
+/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
+   We sprintf these numbers into a 24 byte string of hex values to make them
+   human-readable (to make debugging simpler.) */
+
+struct gdlm_strname {
+	unsigned char		name[GDLM_STRNAME_BYTES];
+	unsigned short		namelen;
+};
+
+enum {
+	DFL_BLOCK_LOCKS		= 0,
+	DFL_SPECTATOR		= 1,
+	DFL_WITHDRAW		= 2,
+};
+
+struct gdlm_ls {
+	u32		id;
+	int			jid;
+	int			first;
+	int			first_done;
+	unsigned long		flags;
+	struct kobject		kobj;
+	char			clustername[GDLM_NAME_LEN];
+	char			fsname[GDLM_NAME_LEN];
+	int			fsflags;
+	dlm_lockspace_t		*dlm_lockspace;
+	lm_callback_t		fscb;
+	struct gfs2_sbd		*sdp;
+	int			recover_jid;
+	int			recover_jid_done;
+	int			recover_jid_status;
+	spinlock_t		async_lock;
+	struct list_head	complete;
+	struct list_head	blocking;
+	struct list_head	delayed;
+	struct list_head	submit;
+	struct list_head	all_locks;
+	u32		all_locks_count;
+	wait_queue_head_t	wait_control;
+	struct task_struct	*thread1;
+	struct task_struct	*thread2;
+	wait_queue_head_t	thread_wait;
+	unsigned long		drop_time;
+	int			drop_locks_count;
+	int			drop_locks_period;
+};
+
+enum {
+	LFL_NOBLOCK		= 0,
+	LFL_NOCACHE		= 1,
+	LFL_DLM_UNLOCK		= 2,
+	LFL_DLM_CANCEL		= 3,
+	LFL_SYNC_LVB		= 4,
+	LFL_FORCE_PROMOTE	= 5,
+	LFL_REREQUEST		= 6,
+	LFL_ACTIVE		= 7,
+	LFL_INLOCK		= 8,
+	LFL_CANCEL		= 9,
+	LFL_NOBAST		= 10,
+	LFL_HEADQUE		= 11,
+	LFL_UNLOCK_DELETE	= 12,
+};
+
+struct gdlm_lock {
+	struct gdlm_ls		*ls;
+	struct lm_lockname	lockname;
+	char			*lvb;
+	struct dlm_lksb		lksb;
+
+	s16			cur;
+	s16			req;
+	s16			prev_req;
+	u32			lkf;		/* dlm flags DLM_LKF_ */
+	unsigned long		flags;		/* lock_dlm flags LFL_ */
+
+	int			bast_mode;	/* protected by async_lock */
+	struct completion	ast_wait;
+
+	struct list_head	clist;		/* complete */
+	struct list_head	blist;		/* blocking */
+	struct list_head	delay_list;	/* delayed */
+	struct list_head	all_list;	/* all locks for the fs */
+	struct gdlm_lock	*hold_null;	/* NL lock for hold_lvb */
+};
+
+#define gdlm_assert(assertion, fmt, args...)                                  \
+do {                                                                          \
+	if (unlikely(!(assertion))) {                                         \
+		printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
+				  "lock_dlm:  " fmt "\n",                     \
+				  #assertion, ##args);                        \
+		BUG();                                                        \
+	}                                                                     \
+} while (0)
+
+#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
+#define log_info(fmt, arg...)  log_print(KERN_INFO , fmt , ## arg)
+#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
+#ifdef LOCK_DLM_LOG_DEBUG
+#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
+#else
+#define log_debug(fmt, arg...)
+#endif
+
+/* sysfs.c */
+
+int gdlm_sysfs_init(void);
+void gdlm_sysfs_exit(void);
+int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
+void gdlm_kobject_release(struct gdlm_ls *);
+
+/* thread.c */
+
+int gdlm_init_threads(struct gdlm_ls *);
+void gdlm_release_threads(struct gdlm_ls *);
+
+/* lock.c */
+
+s16 gdlm_make_lmstate(s16);
+void gdlm_queue_delayed(struct gdlm_lock *);
+void gdlm_submit_delayed(struct gdlm_ls *);
+int gdlm_release_all_locks(struct gdlm_ls *);
+void gdlm_delete_lp(struct gdlm_lock *);
+unsigned int gdlm_do_lock(struct gdlm_lock *);
+
+int gdlm_get_lock(void *, struct lm_lockname *, void **);
+void gdlm_put_lock(void *);
+unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
+unsigned int gdlm_unlock(void *, unsigned int);
+void gdlm_cancel(void *);
+int gdlm_hold_lvb(void *, char **);
+void gdlm_unhold_lvb(void *, char *);
+
+/* plock.c */
+
+int gdlm_plock_init(void);
+void gdlm_plock_exit(void);
+int gdlm_plock(void *, struct lm_lockname *, struct file *, int,
+		struct file_lock *);
+int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
+		struct file_lock *);
+int gdlm_punlock(void *, struct lm_lockname *, struct file *,
+		struct file_lock *);
+#endif
+
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
new file mode 100644
index 000000000000..2194b1d5b5ec
--- /dev/null
+++ b/fs/gfs2/locking/dlm/main.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/init.h>
+
+#include "lock_dlm.h"
+
+extern int gdlm_drop_count;
+extern int gdlm_drop_period;
+
+extern struct lm_lockops gdlm_ops;
+
+static int __init init_lock_dlm(void)
+{
+	int error;
+
+	error = gfs2_register_lockproto(&gdlm_ops);
+	if (error) {
+		printk(KERN_WARNING "lock_dlm:  can't register protocol: %d\n",
+		       error);
+		return error;
+	}
+
+	error = gdlm_sysfs_init();
+	if (error) {
+		gfs2_unregister_lockproto(&gdlm_ops);
+		return error;
+	}
+
+	error = gdlm_plock_init();
+	if (error) {
+		gdlm_sysfs_exit();
+		gfs2_unregister_lockproto(&gdlm_ops);
+		return error;
+	}
+
+	gdlm_drop_count = GDLM_DROP_COUNT;
+	gdlm_drop_period = GDLM_DROP_PERIOD;
+
+	printk(KERN_INFO
+	       "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
+	return 0;
+}
+
+static void __exit exit_lock_dlm(void)
+{
+	gdlm_plock_exit();
+	gdlm_sysfs_exit();
+	gfs2_unregister_lockproto(&gdlm_ops);
+}
+
+module_init(init_lock_dlm);
+module_exit(exit_lock_dlm);
+
+MODULE_DESCRIPTION("GFS DLM Locking Module");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
new file mode 100644
index 000000000000..cdd1694e889b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include "lock_dlm.h"
+
+int gdlm_drop_count;
+int gdlm_drop_period;
+const struct lm_lockops gdlm_ops;
+
+
+static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
+				 int flags, char *table_name)
+{
+	struct gdlm_ls *ls;
+	char buf[256], *p;
+
+	ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
+	if (!ls)
+		return NULL;
+
+	ls->drop_locks_count = gdlm_drop_count;
+	ls->drop_locks_period = gdlm_drop_period;
+	ls->fscb = cb;
+	ls->sdp = sdp;
+	ls->fsflags = flags;
+	spin_lock_init(&ls->async_lock);
+	INIT_LIST_HEAD(&ls->complete);
+	INIT_LIST_HEAD(&ls->blocking);
+	INIT_LIST_HEAD(&ls->delayed);
+	INIT_LIST_HEAD(&ls->submit);
+	INIT_LIST_HEAD(&ls->all_locks);
+	init_waitqueue_head(&ls->thread_wait);
+	init_waitqueue_head(&ls->wait_control);
+	ls->thread1 = NULL;
+	ls->thread2 = NULL;
+	ls->drop_time = jiffies;
+	ls->jid = -1;
+
+	strncpy(buf, table_name, 256);
+	buf[255] = '\0';
+
+	p = strchr(buf, ':');
+	if (!p) {
+		log_info("invalid table_name \"%s\"", table_name);
+		kfree(ls);
+		return NULL;
+	}
+	*p = '\0';
+	p++;
+
+	strncpy(ls->clustername, buf, GDLM_NAME_LEN);
+	strncpy(ls->fsname, p, GDLM_NAME_LEN);
+
+	return ls;
+}
+
+static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
+{
+	char data[256];
+	char *options, *x, *y;
+	int error = 0;
+
+	memset(data, 0, 256);
+	strncpy(data, data_arg, 255);
+
+	for (options = data; (x = strsep(&options, ":")); ) {
+		if (!*x)
+			continue;
+
+		y = strchr(x, '=');
+		if (y)
+			*y++ = 0;
+
+		if (!strcmp(x, "jid")) {
+			if (!y) {
+				log_error("need argument to jid");
+				error = -EINVAL;
+				break;
+			}
+			sscanf(y, "%u", &ls->jid);
+
+		} else if (!strcmp(x, "first")) {
+			if (!y) {
+				log_error("need argument to first");
+				error = -EINVAL;
+				break;
+			}
+			sscanf(y, "%u", &ls->first);
+
+		} else if (!strcmp(x, "id")) {
+			if (!y) {
+				log_error("need argument to id");
+				error = -EINVAL;
+				break;
+			}
+			sscanf(y, "%u", &ls->id);
+
+		} else if (!strcmp(x, "nodir")) {
+			if (!y) {
+				log_error("need argument to nodir");
+				error = -EINVAL;
+				break;
+			}
+			sscanf(y, "%u", nodir);
+
+		} else {
+			log_error("unkonwn option: %s", x);
+			error = -EINVAL;
+			break;
+		}
+	}
+
+	return error;
+}
+
+static int gdlm_mount(char *table_name, char *host_data,
+			lm_callback_t cb, void *cb_data,
+			unsigned int min_lvb_size, int flags,
+			struct lm_lockstruct *lockstruct,
+			struct kobject *fskobj)
+{
+	struct gdlm_ls *ls;
+	int error = -ENOMEM, nodir = 0;
+
+	if (min_lvb_size > GDLM_LVB_SIZE)
+		goto out;
+
+	ls = init_gdlm(cb, cb_data, flags, table_name);
+	if (!ls)
+		goto out;
+
+	error = make_args(ls, host_data, &nodir);
+	if (error)
+		goto out;
+
+	error = gdlm_init_threads(ls);
+	if (error)
+		goto out_free;
+
+	error = gdlm_kobject_setup(ls, fskobj);
+	if (error)
+		goto out_thread;
+
+	error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
+				  &ls->dlm_lockspace,
+				  nodir ? DLM_LSFL_NODIR : 0,
+				  GDLM_LVB_SIZE);
+	if (error) {
+		log_error("dlm_new_lockspace error %d", error);
+		goto out_kobj;
+	}
+
+	lockstruct->ls_jid = ls->jid;
+	lockstruct->ls_first = ls->first;
+	lockstruct->ls_lockspace = ls;
+	lockstruct->ls_ops = &gdlm_ops;
+	lockstruct->ls_flags = 0;
+	lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
+	return 0;
+
+out_kobj:
+	gdlm_kobject_release(ls);
+out_thread:
+	gdlm_release_threads(ls);
+out_free:
+	kfree(ls);
+out:
+	return error;
+}
+
+static void gdlm_unmount(void *lockspace)
+{
+	struct gdlm_ls *ls = lockspace;
+	int rv;
+
+	log_debug("unmount flags %lx", ls->flags);
+
+	/* FIXME: serialize unmount and withdraw in case they
+	   happen at once.  Also, if unmount follows withdraw,
+	   wait for withdraw to finish. */
+
+	if (test_bit(DFL_WITHDRAW, &ls->flags))
+		goto out;
+
+	gdlm_kobject_release(ls);
+	dlm_release_lockspace(ls->dlm_lockspace, 2);
+	gdlm_release_threads(ls);
+	rv = gdlm_release_all_locks(ls);
+	if (rv)
+		log_info("gdlm_unmount: %d stray locks freed", rv);
+out:
+	kfree(ls);
+}
+
+static void gdlm_recovery_done(void *lockspace, unsigned int jid,
+                               unsigned int message)
+{
+	struct gdlm_ls *ls = lockspace;
+	ls->recover_jid_done = jid;
+	ls->recover_jid_status = message;
+	kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+}
+
+static void gdlm_others_may_mount(void *lockspace)
+{
+	struct gdlm_ls *ls = lockspace;
+	ls->first_done = 1;
+	kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+}
+
+/* Userspace gets the offline uevent, blocks new gfs locks on
+   other mounters, and lets us know (sets WITHDRAW flag).  Then,
+   userspace leaves the mount group while we leave the lockspace. */
+
+static void gdlm_withdraw(void *lockspace)
+{
+	struct gdlm_ls *ls = lockspace;
+
+	kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
+
+	wait_event_interruptible(ls->wait_control,
+				 test_bit(DFL_WITHDRAW, &ls->flags));
+
+	dlm_release_lockspace(ls->dlm_lockspace, 2);
+	gdlm_release_threads(ls);
+	gdlm_release_all_locks(ls);
+	gdlm_kobject_release(ls);
+}
+
+const struct lm_lockops gdlm_ops = {
+	.lm_proto_name = "lock_dlm",
+	.lm_mount = gdlm_mount,
+	.lm_others_may_mount = gdlm_others_may_mount,
+	.lm_unmount = gdlm_unmount,
+	.lm_withdraw = gdlm_withdraw,
+	.lm_get_lock = gdlm_get_lock,
+	.lm_put_lock = gdlm_put_lock,
+	.lm_lock = gdlm_lock,
+	.lm_unlock = gdlm_unlock,
+	.lm_plock = gdlm_plock,
+	.lm_punlock = gdlm_punlock,
+	.lm_plock_get = gdlm_plock_get,
+	.lm_cancel = gdlm_cancel,
+	.lm_hold_lvb = gdlm_hold_lvb,
+	.lm_unhold_lvb = gdlm_unhold_lvb,
+	.lm_recovery_done = gdlm_recovery_done,
+	.lm_owner = THIS_MODULE,
+};
+
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
new file mode 100644
index 000000000000..7365aec9511b
--- /dev/null
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/lock_dlm_plock.h>
+
+#include "lock_dlm.h"
+
+
+static spinlock_t ops_lock;
+static struct list_head send_list;
+static struct list_head recv_list;
+static wait_queue_head_t send_wq;
+static wait_queue_head_t recv_wq;
+
+struct plock_op {
+	struct list_head list;
+	int done;
+	struct gdlm_plock_info info;
+};
+
+static inline void set_version(struct gdlm_plock_info *info)
+{
+	info->version[0] = GDLM_PLOCK_VERSION_MAJOR;
+	info->version[1] = GDLM_PLOCK_VERSION_MINOR;
+	info->version[2] = GDLM_PLOCK_VERSION_PATCH;
+}
+
+static int check_version(struct gdlm_plock_info *info)
+{
+	if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
+	    (GDLM_PLOCK_VERSION_MINOR < info->version[1])) {
+		log_error("plock device version mismatch: "
+			  "kernel (%u.%u.%u), user (%u.%u.%u)",
+			  GDLM_PLOCK_VERSION_MAJOR,
+			  GDLM_PLOCK_VERSION_MINOR,
+			  GDLM_PLOCK_VERSION_PATCH,
+			  info->version[0],
+			  info->version[1],
+			  info->version[2]);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void send_op(struct plock_op *op)
+{
+	set_version(&op->info);
+	INIT_LIST_HEAD(&op->list);
+	spin_lock(&ops_lock);
+	list_add_tail(&op->list, &send_list);
+	spin_unlock(&ops_lock);
+	wake_up(&send_wq);
+}
+
+int gdlm_plock(void *lockspace, struct lm_lockname *name,
+	       struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gdlm_ls *ls = lockspace;
+	struct plock_op *op;
+	int rv;
+
+	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (!op)
+		return -ENOMEM;
+
+	op->info.optype		= GDLM_PLOCK_OP_LOCK;
+	op->info.pid		= fl->fl_pid;
+	op->info.ex		= (fl->fl_type == F_WRLCK);
+	op->info.wait		= IS_SETLKW(cmd);
+	op->info.fsid		= ls->id;
+	op->info.number		= name->ln_number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+	op->info.owner		= (__u64)(long) fl->fl_owner;
+
+	send_op(op);
+	wait_event(recv_wq, (op->done != 0));
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		printk(KERN_INFO "plock op on list\n");
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	rv = op->info.rv;
+
+	if (!rv) {
+		if (posix_lock_file_wait(file, fl) < 0)
+			log_error("gdlm_plock: vfs lock error %x,%llx",
+				  name->ln_type,
+				  (unsigned long long)name->ln_number);
+	}
+
+	kfree(op);
+	return rv;
+}
+
+int gdlm_punlock(void *lockspace, struct lm_lockname *name,
+		 struct file *file, struct file_lock *fl)
+{
+	struct gdlm_ls *ls = lockspace;
+	struct plock_op *op;
+	int rv;
+
+	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (!op)
+		return -ENOMEM;
+
+	if (posix_lock_file_wait(file, fl) < 0)
+		log_error("gdlm_punlock: vfs unlock error %x,%llx",
+			  name->ln_type, (unsigned long long)name->ln_number);
+
+	op->info.optype		= GDLM_PLOCK_OP_UNLOCK;
+	op->info.pid		= fl->fl_pid;
+	op->info.fsid		= ls->id;
+	op->info.number		= name->ln_number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+	op->info.owner		= (__u64)(long) fl->fl_owner;
+
+	send_op(op);
+	wait_event(recv_wq, (op->done != 0));
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		printk(KERN_INFO "punlock op on list\n");
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	rv = op->info.rv;
+
+	kfree(op);
+	return rv;
+}
+
+int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
+		   struct file *file, struct file_lock *fl)
+{
+	struct gdlm_ls *ls = lockspace;
+	struct plock_op *op;
+	int rv;
+
+	op = kzalloc(sizeof(*op), GFP_KERNEL);
+	if (!op)
+		return -ENOMEM;
+
+	op->info.optype		= GDLM_PLOCK_OP_GET;
+	op->info.pid		= fl->fl_pid;
+	op->info.ex		= (fl->fl_type == F_WRLCK);
+	op->info.fsid		= ls->id;
+	op->info.number		= name->ln_number;
+	op->info.start		= fl->fl_start;
+	op->info.end		= fl->fl_end;
+
+	send_op(op);
+	wait_event(recv_wq, (op->done != 0));
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&op->list)) {
+		printk(KERN_INFO "plock_get op on list\n");
+		list_del(&op->list);
+	}
+	spin_unlock(&ops_lock);
+
+	rv = op->info.rv;
+
+	if (rv == 0)
+		fl->fl_type = F_UNLCK;
+	else if (rv > 0) {
+		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+		fl->fl_pid = op->info.pid;
+		fl->fl_start = op->info.start;
+		fl->fl_end = op->info.end;
+	}
+
+	kfree(op);
+	return rv;
+}
+
+/* a read copies out one plock request from the send list */
+static ssize_t dev_read(struct file *file, char __user *u, size_t count,
+			loff_t *ppos)
+{
+	struct gdlm_plock_info info;
+	struct plock_op *op = NULL;
+
+	if (count < sizeof(info))
+		return -EINVAL;
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&send_list)) {
+		op = list_entry(send_list.next, struct plock_op, list);
+		list_move(&op->list, &recv_list);
+		memcpy(&info, &op->info, sizeof(info));
+	}
+	spin_unlock(&ops_lock);
+
+	if (!op)
+		return -EAGAIN;
+
+	if (copy_to_user(u, &info, sizeof(info)))
+		return -EFAULT;
+	return sizeof(info);
+}
+
+/* a write copies in one plock result that should match a plock_op
+   on the recv list */
+static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
+			 loff_t *ppos)
+{
+	struct gdlm_plock_info info;
+	struct plock_op *op;
+	int found = 0;
+
+	if (count != sizeof(info))
+		return -EINVAL;
+
+	if (copy_from_user(&info, u, sizeof(info)))
+		return -EFAULT;
+
+	if (check_version(&info))
+		return -EINVAL;
+
+	spin_lock(&ops_lock);
+	list_for_each_entry(op, &recv_list, list) {
+		if (op->info.fsid == info.fsid && op->info.number == info.number &&
+		    op->info.owner == info.owner) {
+			list_del_init(&op->list);
+			found = 1;
+			op->done = 1;
+			memcpy(&op->info, &info, sizeof(info));
+			break;
+		}
+	}
+	spin_unlock(&ops_lock);
+
+	if (found)
+		wake_up(&recv_wq);
+	else
+		printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid,
+			(unsigned long long)info.number);
+	return count;
+}
+
+static unsigned int dev_poll(struct file *file, poll_table *wait)
+{
+	poll_wait(file, &send_wq, wait);
+
+	spin_lock(&ops_lock);
+	if (!list_empty(&send_list)) {
+		spin_unlock(&ops_lock);
+		return POLLIN | POLLRDNORM;
+	}
+	spin_unlock(&ops_lock);
+	return 0;
+}
+
+static struct file_operations dev_fops = {
+	.read    = dev_read,
+	.write   = dev_write,
+	.poll    = dev_poll,
+	.owner   = THIS_MODULE
+};
+
+static struct miscdevice plock_dev_misc = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = GDLM_PLOCK_MISC_NAME,
+	.fops = &dev_fops
+};
+
+int gdlm_plock_init(void)
+{
+	int rv;
+
+	spin_lock_init(&ops_lock);
+	INIT_LIST_HEAD(&send_list);
+	INIT_LIST_HEAD(&recv_list);
+	init_waitqueue_head(&send_wq);
+	init_waitqueue_head(&recv_wq);
+
+	rv = misc_register(&plock_dev_misc);
+	if (rv)
+		printk(KERN_INFO "gdlm_plock_init: misc_register failed %d",
+		       rv);
+	return rv;
+}
+
+void gdlm_plock_exit(void)
+{
+	if (misc_deregister(&plock_dev_misc) < 0)
+		printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed");
+}
+
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
new file mode 100644
index 000000000000..29ae06f94944
--- /dev/null
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/ctype.h>
+#include <linux/stat.h>
+
+#include "lock_dlm.h"
+
+extern struct lm_lockops gdlm_ops;
+
+static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
+{
+	return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
+}
+
+static ssize_t block_show(struct gdlm_ls *ls, char *buf)
+{
+	ssize_t ret;
+	int val = 0;
+
+	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
+		val = 1;
+	ret = sprintf(buf, "%d\n", val);
+	return ret;
+}
+
+static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{
+	ssize_t ret = len;
+	int val;
+
+	val = simple_strtol(buf, NULL, 0);
+
+	if (val == 1)
+		set_bit(DFL_BLOCK_LOCKS, &ls->flags);
+	else if (val == 0) {
+		clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
+		gdlm_submit_delayed(ls);
+	} else {
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
+{
+	ssize_t ret;
+	int val = 0;
+
+	if (test_bit(DFL_WITHDRAW, &ls->flags))
+		val = 1;
+	ret = sprintf(buf, "%d\n", val);
+	return ret;
+}
+
+static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{
+	ssize_t ret = len;
+	int val;
+
+	val = simple_strtol(buf, NULL, 0);
+
+	if (val == 1)
+		set_bit(DFL_WITHDRAW, &ls->flags);
+	else
+		ret = -EINVAL;
+	wake_up(&ls->wait_control);
+	return ret;
+}
+
+static ssize_t id_show(struct gdlm_ls *ls, char *buf)
+{
+	return sprintf(buf, "%u\n", ls->id);
+}
+
+static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
+{
+	return sprintf(buf, "%d\n", ls->jid);
+}
+
+static ssize_t first_show(struct gdlm_ls *ls, char *buf)
+{
+	return sprintf(buf, "%d\n", ls->first);
+}
+
+static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
+{
+	return sprintf(buf, "%d\n", ls->first_done);
+}
+
+static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
+{
+	return sprintf(buf, "%d\n", ls->recover_jid);
+}
+
+static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
+{
+	ls->recover_jid = simple_strtol(buf, NULL, 0);
+	ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
+	return len;
+}
+
+static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
+{
+	return sprintf(buf, "%d\n", ls->recover_jid_done);
+}
+
+static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
+{
+	return sprintf(buf, "%d\n", ls->recover_jid_status);
+}
+
+struct gdlm_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gdlm_ls *, char *);
+	ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
+};
+
+#define GDLM_ATTR(_name,_mode,_show,_store) \
+static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+
+GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
+GDLM_ATTR(block,          0644, block_show,          block_store);
+GDLM_ATTR(withdraw,       0644, withdraw_show,       withdraw_store);
+GDLM_ATTR(id,             0444, id_show,             NULL);
+GDLM_ATTR(jid,            0444, jid_show,            NULL);
+GDLM_ATTR(first,          0444, first_show,          NULL);
+GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
+GDLM_ATTR(recover,        0644, recover_show,        recover_store);
+GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+
+static struct attribute *gdlm_attrs[] = {
+	&gdlm_attr_proto_name.attr,
+	&gdlm_attr_block.attr,
+	&gdlm_attr_withdraw.attr,
+	&gdlm_attr_id.attr,
+	&gdlm_attr_jid.attr,
+	&gdlm_attr_first.attr,
+	&gdlm_attr_first_done.attr,
+	&gdlm_attr_recover.attr,
+	&gdlm_attr_recover_done.attr,
+	&gdlm_attr_recover_status.attr,
+	NULL,
+};
+
+static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
+	struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
+	return a->show ? a->show(ls, buf) : 0;
+}
+
+static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
+	struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
+	return a->store ? a->store(ls, buf, len) : len;
+}
+
+static struct sysfs_ops gdlm_attr_ops = {
+	.show  = gdlm_attr_show,
+	.store = gdlm_attr_store,
+};
+
+static struct kobj_type gdlm_ktype = {
+	.default_attrs = gdlm_attrs,
+	.sysfs_ops     = &gdlm_attr_ops,
+};
+
+static struct kset gdlm_kset = {
+	.subsys = &kernel_subsys,
+	.kobj   = {.name = "lock_dlm",},
+	.ktype  = &gdlm_ktype,
+};
+
+int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
+{
+	int error;
+
+	error = kobject_set_name(&ls->kobj, "%s", "lock_module");
+	if (error) {
+		log_error("can't set kobj name %d", error);
+		return error;
+	}
+
+	ls->kobj.kset = &gdlm_kset;
+	ls->kobj.ktype = &gdlm_ktype;
+	ls->kobj.parent = fskobj;
+
+	error = kobject_register(&ls->kobj);
+	if (error)
+		log_error("can't register kobj %d", error);
+
+	return error;
+}
+
+void gdlm_kobject_release(struct gdlm_ls *ls)
+{
+	kobject_unregister(&ls->kobj);
+}
+
+int gdlm_sysfs_init(void)
+{
+	int error;
+
+	error = kset_register(&gdlm_kset);
+	if (error)
+		printk("lock_dlm: cannot register kset %d\n", error);
+
+	return error;
+}
+
+void gdlm_sysfs_exit(void)
+{
+	kset_unregister(&gdlm_kset);
+}
+
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
new file mode 100644
index 000000000000..9cf1f168eaf8
--- /dev/null
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -0,0 +1,359 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include "lock_dlm.h"
+
+/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+   thread gets to it. */
+
+static void queue_submit(struct gdlm_lock *lp)
+{
+	struct gdlm_ls *ls = lp->ls;
+
+	spin_lock(&ls->async_lock);
+	list_add_tail(&lp->delay_list, &ls->submit);
+	spin_unlock(&ls->async_lock);
+	wake_up(&ls->thread_wait);
+}
+
+static void process_blocking(struct gdlm_lock *lp, int bast_mode)
+{
+	struct gdlm_ls *ls = lp->ls;
+	unsigned int cb = 0;
+
+	switch (gdlm_make_lmstate(bast_mode)) {
+	case LM_ST_EXCLUSIVE:
+		cb = LM_CB_NEED_E;
+		break;
+	case LM_ST_DEFERRED:
+		cb = LM_CB_NEED_D;
+		break;
+	case LM_ST_SHARED:
+		cb = LM_CB_NEED_S;
+		break;
+	default:
+		gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
+	}
+
+	ls->fscb(ls->sdp, cb, &lp->lockname);
+}
+
+static void process_complete(struct gdlm_lock *lp)
+{
+	struct gdlm_ls *ls = lp->ls;
+	struct lm_async_cb acb;
+	s16 prev_mode = lp->cur;
+
+	memset(&acb, 0, sizeof(acb));
+
+	if (lp->lksb.sb_status == -DLM_ECANCEL) {
+		log_info("complete dlm cancel %x,%llx flags %lx",
+		 	 lp->lockname.ln_type,
+			 (unsigned long long)lp->lockname.ln_number,
+			 lp->flags);
+
+		lp->req = lp->cur;
+		acb.lc_ret |= LM_OUT_CANCELED;
+		if (lp->cur == DLM_LOCK_IV)
+			lp->lksb.sb_lkid = 0;
+		goto out;
+	}
+
+	if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+		if (lp->lksb.sb_status != -DLM_EUNLOCK) {
+			log_info("unlock sb_status %d %x,%llx flags %lx",
+				 lp->lksb.sb_status, lp->lockname.ln_type,
+				 (unsigned long long)lp->lockname.ln_number,
+				 lp->flags);
+			return;
+		}
+
+		lp->cur = DLM_LOCK_IV;
+		lp->req = DLM_LOCK_IV;
+		lp->lksb.sb_lkid = 0;
+
+		if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
+			gdlm_delete_lp(lp);
+			return;
+		}
+		goto out;
+	}
+
+	if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
+		memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+
+	if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
+		if (lp->req == DLM_LOCK_PR)
+			lp->req = DLM_LOCK_CW;
+		else if (lp->req == DLM_LOCK_CW)
+			lp->req = DLM_LOCK_PR;
+	}
+
+	/*
+	 * A canceled lock request.  The lock was just taken off the delayed
+	 * list and was never even submitted to dlm.
+	 */
+
+	if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
+		log_info("complete internal cancel %x,%llx",
+		 	 lp->lockname.ln_type,
+			 (unsigned long long)lp->lockname.ln_number);
+		lp->req = lp->cur;
+		acb.lc_ret |= LM_OUT_CANCELED;
+		goto out;
+	}
+
+	/*
+	 * An error occured.
+	 */
+
+	if (lp->lksb.sb_status) {
+		/* a "normal" error */
+		if ((lp->lksb.sb_status == -EAGAIN) &&
+		    (lp->lkf & DLM_LKF_NOQUEUE)) {
+			lp->req = lp->cur;
+			if (lp->cur == DLM_LOCK_IV)
+				lp->lksb.sb_lkid = 0;
+			goto out;
+		}
+
+		/* this could only happen with cancels I think */
+		log_info("ast sb_status %d %x,%llx flags %lx",
+			 lp->lksb.sb_status, lp->lockname.ln_type,
+			 (unsigned long long)lp->lockname.ln_number,
+			 lp->flags);
+		return;
+	}
+
+	/*
+	 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
+	 */
+
+	if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
+		complete(&lp->ast_wait);
+		return;
+	}
+
+	/*
+	 * A lock has been demoted to NL because it initially completed during
+	 * BLOCK_LOCKS.  Now it must be requested in the originally requested
+	 * mode.
+	 */
+
+	if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
+		gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
+			    lp->lockname.ln_type,
+			    (unsigned long long)lp->lockname.ln_number);
+		gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
+			    lp->lockname.ln_type,
+			    (unsigned long long)lp->lockname.ln_number);
+
+		lp->cur = DLM_LOCK_NL;
+		lp->req = lp->prev_req;
+		lp->prev_req = DLM_LOCK_IV;
+		lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+		set_bit(LFL_NOCACHE, &lp->flags);
+
+		if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+		    !test_bit(LFL_NOBLOCK, &lp->flags))
+			gdlm_queue_delayed(lp);
+		else
+			queue_submit(lp);
+		return;
+	}
+
+	/*
+	 * A request is granted during dlm recovery.  It may be granted
+	 * because the locks of a failed node were cleared.  In that case,
+	 * there may be inconsistent data beneath this lock and we must wait
+	 * for recovery to complete to use it.  When gfs recovery is done this
+	 * granted lock will be converted to NL and then reacquired in this
+	 * granted state.
+	 */
+
+	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+	    !test_bit(LFL_NOBLOCK, &lp->flags) &&
+	    lp->req != DLM_LOCK_NL) {
+
+		lp->cur = lp->req;
+		lp->prev_req = lp->req;
+		lp->req = DLM_LOCK_NL;
+		lp->lkf |= DLM_LKF_CONVERT;
+		lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+		log_debug("rereq %x,%llx id %x %d,%d",
+			  lp->lockname.ln_type,
+			  (unsigned long long)lp->lockname.ln_number,
+			  lp->lksb.sb_lkid, lp->cur, lp->req);
+
+		set_bit(LFL_REREQUEST, &lp->flags);
+		queue_submit(lp);
+		return;
+	}
+
+	/*
+	 * DLM demoted the lock to NL before it was granted so GFS must be
+	 * told it cannot cache data for this lock.
+	 */
+
+	if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
+		set_bit(LFL_NOCACHE, &lp->flags);
+
+out:
+	/*
+	 * This is an internal lock_dlm lock
+	 */
+
+	if (test_bit(LFL_INLOCK, &lp->flags)) {
+		clear_bit(LFL_NOBLOCK, &lp->flags);
+		lp->cur = lp->req;
+		complete(&lp->ast_wait);
+		return;
+	}
+
+	/*
+	 * Normal completion of a lock request.  Tell GFS it now has the lock.
+	 */
+
+	clear_bit(LFL_NOBLOCK, &lp->flags);
+	lp->cur = lp->req;
+
+	acb.lc_name = lp->lockname;
+	acb.lc_ret |= gdlm_make_lmstate(lp->cur);
+
+	if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
+	    (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
+		acb.lc_ret |= LM_OUT_CACHEABLE;
+
+	ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
+}
+
+static inline int no_work(struct gdlm_ls *ls, int blocking)
+{
+	int ret;
+
+	spin_lock(&ls->async_lock);
+	ret = list_empty(&ls->complete) && list_empty(&ls->submit);
+	if (ret && blocking)
+		ret = list_empty(&ls->blocking);
+	spin_unlock(&ls->async_lock);
+
+	return ret;
+}
+
+static inline int check_drop(struct gdlm_ls *ls)
+{
+	if (!ls->drop_locks_count)
+		return 0;
+
+	if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
+		ls->drop_time = jiffies;
+		if (ls->all_locks_count >= ls->drop_locks_count)
+			return 1;
+	}
+	return 0;
+}
+
+static int gdlm_thread(void *data)
+{
+	struct gdlm_ls *ls = (struct gdlm_ls *) data;
+	struct gdlm_lock *lp = NULL;
+	int blist = 0;
+	uint8_t complete, blocking, submit, drop;
+	DECLARE_WAITQUEUE(wait, current);
+
+	/* Only thread1 is allowed to do blocking callbacks since gfs
+	   may wait for a completion callback within a blocking cb. */
+
+	if (current == ls->thread1)
+		blist = 1;
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue(&ls->thread_wait, &wait);
+		if (no_work(ls, blist))
+			schedule();
+		remove_wait_queue(&ls->thread_wait, &wait);
+		set_current_state(TASK_RUNNING);
+
+		complete = blocking = submit = drop = 0;
+
+		spin_lock(&ls->async_lock);
+
+		if (blist && !list_empty(&ls->blocking)) {
+			lp = list_entry(ls->blocking.next, struct gdlm_lock,
+					blist);
+			list_del_init(&lp->blist);
+			blocking = lp->bast_mode;
+			lp->bast_mode = 0;
+		} else if (!list_empty(&ls->complete)) {
+			lp = list_entry(ls->complete.next, struct gdlm_lock,
+					clist);
+			list_del_init(&lp->clist);
+			complete = 1;
+		} else if (!list_empty(&ls->submit)) {
+			lp = list_entry(ls->submit.next, struct gdlm_lock,
+					delay_list);
+			list_del_init(&lp->delay_list);
+			submit = 1;
+		}
+
+		drop = check_drop(ls);
+		spin_unlock(&ls->async_lock);
+
+		if (complete)
+			process_complete(lp);
+
+		else if (blocking)
+			process_blocking(lp, blocking);
+
+		else if (submit)
+			gdlm_do_lock(lp);
+
+		if (drop)
+			ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
+
+		schedule();
+	}
+
+	return 0;
+}
+
+int gdlm_init_threads(struct gdlm_ls *ls)
+{
+	struct task_struct *p;
+	int error;
+
+	p = kthread_run(gdlm_thread, ls, "lock_dlm1");
+	error = IS_ERR(p);
+	if (error) {
+		log_error("can't start lock_dlm1 thread %d", error);
+		return error;
+	}
+	ls->thread1 = p;
+
+	p = kthread_run(gdlm_thread, ls, "lock_dlm2");
+	error = IS_ERR(p);
+	if (error) {
+		log_error("can't start lock_dlm2 thread %d", error);
+		kthread_stop(ls->thread1);
+		return error;
+	}
+	ls->thread2 = p;
+
+	return 0;
+}
+
+void gdlm_release_threads(struct gdlm_ls *ls)
+{
+	kthread_stop(ls->thread1);
+	kthread_stop(ls->thread2);
+}
+
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
new file mode 100644
index 000000000000..35e9730bc3a8
--- /dev/null
+++ b/fs/gfs2/locking/nolock/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
+lock_nolock-y := main.o
+
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
new file mode 100644
index 000000000000..acfbc941f319
--- /dev/null
+++ b/fs/gfs2/locking/nolock/main.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/lm_interface.h>
+
+struct nolock_lockspace {
+	unsigned int nl_lvb_size;
+};
+
+static const struct lm_lockops nolock_ops;
+
+static int nolock_mount(char *table_name, char *host_data,
+			lm_callback_t cb, void *cb_data,
+			unsigned int min_lvb_size, int flags,
+			struct lm_lockstruct *lockstruct,
+			struct kobject *fskobj)
+{
+	char *c;
+	unsigned int jid;
+	struct nolock_lockspace *nl;
+
+	c = strstr(host_data, "jid=");
+	if (!c)
+		jid = 0;
+	else {
+		c += 4;
+		sscanf(c, "%u", &jid);
+	}
+
+	nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
+	if (!nl)
+		return -ENOMEM;
+
+	nl->nl_lvb_size = min_lvb_size;
+
+	lockstruct->ls_jid = jid;
+	lockstruct->ls_first = 1;
+	lockstruct->ls_lvb_size = min_lvb_size;
+	lockstruct->ls_lockspace = nl;
+	lockstruct->ls_ops = &nolock_ops;
+	lockstruct->ls_flags = LM_LSFLAG_LOCAL;
+
+	return 0;
+}
+
+static void nolock_others_may_mount(void *lockspace)
+{
+}
+
+static void nolock_unmount(void *lockspace)
+{
+	struct nolock_lockspace *nl = lockspace;
+	kfree(nl);
+}
+
+static void nolock_withdraw(void *lockspace)
+{
+}
+
+/**
+ * nolock_get_lock - get a lm_lock_t given a descripton of the lock
+ * @lockspace: the lockspace the lock lives in
+ * @name: the name of the lock
+ * @lockp: return the lm_lock_t here
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+
+static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
+			   void **lockp)
+{
+	*lockp = lockspace;
+	return 0;
+}
+
+/**
+ * nolock_put_lock - get rid of a lock structure
+ * @lock: the lock to throw away
+ *
+ */
+
+static void nolock_put_lock(void *lock)
+{
+}
+
+/**
+ * nolock_lock - acquire a lock
+ * @lock: the lock to manipulate
+ * @cur_state: the current state
+ * @req_state: the requested state
+ * @flags: modifier flags
+ *
+ * Returns: A bitmap of LM_OUT_*
+ */
+
+static unsigned int nolock_lock(void *lock, unsigned int cur_state,
+				unsigned int req_state, unsigned int flags)
+{
+	return req_state | LM_OUT_CACHEABLE;
+}
+
+/**
+ * nolock_unlock - unlock a lock
+ * @lock: the lock to manipulate
+ * @cur_state: the current state
+ *
+ * Returns: 0
+ */
+
+static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
+{
+	return 0;
+}
+
+static void nolock_cancel(void *lock)
+{
+}
+
+/**
+ * nolock_hold_lvb - hold on to a lock value block
+ * @lock: the lock the LVB is associated with
+ * @lvbp: return the lm_lvb_t here
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+
+static int nolock_hold_lvb(void *lock, char **lvbp)
+{
+	struct nolock_lockspace *nl = lock;
+	int error = 0;
+
+	*lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL);
+	if (!*lvbp)
+		error = -ENOMEM;
+
+	return error;
+}
+
+/**
+ * nolock_unhold_lvb - release a LVB
+ * @lock: the lock the LVB is associated with
+ * @lvb: the lock value block
+ *
+ */
+
+static void nolock_unhold_lvb(void *lock, char *lvb)
+{
+	kfree(lvb);
+}
+
+static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
+			    struct file *file, struct file_lock *fl)
+{
+	struct file_lock tmp;
+	int ret;
+
+	ret = posix_test_lock(file, fl, &tmp);
+	fl->fl_type = F_UNLCK;
+	if (ret)
+		memcpy(fl, &tmp, sizeof(struct file_lock));
+
+	return 0;
+}
+
+static int nolock_plock(void *lockspace, struct lm_lockname *name,
+			struct file *file, int cmd, struct file_lock *fl)
+{
+	int error;
+	error = posix_lock_file_wait(file, fl);
+	return error;
+}
+
+static int nolock_punlock(void *lockspace, struct lm_lockname *name,
+			  struct file *file, struct file_lock *fl)
+{
+	int error;
+	error = posix_lock_file_wait(file, fl);
+	return error;
+}
+
+static void nolock_recovery_done(void *lockspace, unsigned int jid,
+				 unsigned int message)
+{
+}
+
+static const struct lm_lockops nolock_ops = {
+	.lm_proto_name = "lock_nolock",
+	.lm_mount = nolock_mount,
+	.lm_others_may_mount = nolock_others_may_mount,
+	.lm_unmount = nolock_unmount,
+	.lm_withdraw = nolock_withdraw,
+	.lm_get_lock = nolock_get_lock,
+	.lm_put_lock = nolock_put_lock,
+	.lm_lock = nolock_lock,
+	.lm_unlock = nolock_unlock,
+	.lm_cancel = nolock_cancel,
+	.lm_hold_lvb = nolock_hold_lvb,
+	.lm_unhold_lvb = nolock_unhold_lvb,
+	.lm_plock_get = nolock_plock_get,
+	.lm_plock = nolock_plock,
+	.lm_punlock = nolock_punlock,
+	.lm_recovery_done = nolock_recovery_done,
+	.lm_owner = THIS_MODULE,
+};
+
+static int __init init_nolock(void)
+{
+	int error;
+
+	error = gfs2_register_lockproto(&nolock_ops);
+	if (error) {
+		printk(KERN_WARNING
+		       "lock_nolock: can't register protocol: %d\n", error);
+		return error;
+	}
+
+	printk(KERN_INFO
+	       "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
+	return 0;
+}
+
+static void __exit exit_nolock(void)
+{
+	gfs2_unregister_lockproto(&nolock_ops);
+}
+
+module_init(init_nolock);
+module_exit(exit_nolock);
+
+MODULE_DESCRIPTION("GFS Nolock Locking Module");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 000000000000..0cace3da9dbb
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "util.h"
+#include "dir.h"
+
+#define PULL 1
+
+/**
+ * gfs2_struct2blk - compute stuff
+ * @sdp: the filesystem
+ * @nstruct: the number of structures
+ * @ssize: the size of the structures
+ *
+ * Compute the number of log descriptor blocks needed to hold a certain number
+ * of structures of a certain size.
+ *
+ * Returns: the number of blocks needed (minimum is always 1)
+ */
+
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+			     unsigned int ssize)
+{
+	unsigned int blks;
+	unsigned int first, second;
+
+	blks = 1;
+	first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize;
+
+	if (nstruct > first) {
+		second = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_meta_header)) / ssize;
+		blks += DIV_ROUND_UP(nstruct - first, second);
+	}
+
+	return blks;
+}
+
+/**
+ * gfs2_ail1_start_one - Start I/O on a part of the AIL
+ * @sdp: the filesystem
+ * @tr: the part of the AIL
+ *
+ */
+
+static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+	struct gfs2_bufdata *bd, *s;
+	struct buffer_head *bh;
+	int retry;
+
+	BUG_ON(!spin_is_locked(&sdp->sd_log_lock));
+
+	do {
+		retry = 0;
+
+		list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
+						 bd_ail_st_list) {
+			bh = bd->bd_bh;
+
+			gfs2_assert(sdp, bd->bd_ail == ai);
+
+			if (!buffer_busy(bh)) {
+				if (!buffer_uptodate(bh)) {
+					gfs2_log_unlock(sdp);
+					gfs2_io_error_bh(sdp, bh);
+					gfs2_log_lock(sdp);
+				}
+				list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+				continue;
+			}
+
+			if (!buffer_dirty(bh))
+				continue;
+
+			list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+
+			gfs2_log_unlock(sdp);
+			wait_on_buffer(bh);
+			ll_rw_block(WRITE, 1, &bh);
+			gfs2_log_lock(sdp);
+
+			retry = 1;
+			break;
+		}
+	} while (retry);
+}
+
+/**
+ * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
+ * @sdp: the filesystem
+ * @ai: the AIL entry
+ *
+ */
+
+static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
+{
+	struct gfs2_bufdata *bd, *s;
+	struct buffer_head *bh;
+
+	list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
+					 bd_ail_st_list) {
+		bh = bd->bd_bh;
+
+		gfs2_assert(sdp, bd->bd_ail == ai);
+
+		if (buffer_busy(bh)) {
+			if (flags & DIO_ALL)
+				continue;
+			else
+				break;
+		}
+
+		if (!buffer_uptodate(bh))
+			gfs2_io_error_bh(sdp, bh);
+
+		list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+	}
+
+	return list_empty(&ai->ai_ail1_list);
+}
+
+void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
+{
+	struct list_head *head = &sdp->sd_ail1_list;
+	u64 sync_gen;
+	struct list_head *first;
+	struct gfs2_ail *first_ai, *ai, *tmp;
+	int done = 0;
+
+	gfs2_log_lock(sdp);
+	if (list_empty(head)) {
+		gfs2_log_unlock(sdp);
+		return;
+	}
+	sync_gen = sdp->sd_ail_sync_gen++;
+
+	first = head->prev;
+	first_ai = list_entry(first, struct gfs2_ail, ai_list);
+	first_ai->ai_sync_gen = sync_gen;
+	gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
+
+	if (flags & DIO_ALL)
+		first = NULL;
+
+	while(!done) {
+		if (first && (head->prev != first ||
+			      gfs2_ail1_empty_one(sdp, first_ai, 0)))
+			break;
+
+		done = 1;
+		list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
+			if (ai->ai_sync_gen >= sync_gen)
+				continue;
+			ai->ai_sync_gen = sync_gen;
+			gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */
+			done = 0;
+			break;
+		}
+	}
+
+	gfs2_log_unlock(sdp);
+}
+
+int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
+{
+	struct gfs2_ail *ai, *s;
+	int ret;
+
+	gfs2_log_lock(sdp);
+
+	list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
+		if (gfs2_ail1_empty_one(sdp, ai, flags))
+			list_move(&ai->ai_list, &sdp->sd_ail2_list);
+		else if (!(flags & DIO_ALL))
+			break;
+	}
+
+	ret = list_empty(&sdp->sd_ail1_list);
+
+	gfs2_log_unlock(sdp);
+
+	return ret;
+}
+
+
+/**
+ * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced
+ * @sdp: the filesystem
+ * @ai: the AIL entry
+ *
+ */
+
+static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+	struct list_head *head = &ai->ai_ail2_list;
+	struct gfs2_bufdata *bd;
+
+	while (!list_empty(head)) {
+		bd = list_entry(head->prev, struct gfs2_bufdata,
+				bd_ail_st_list);
+		gfs2_assert(sdp, bd->bd_ail == ai);
+		bd->bd_ail = NULL;
+		list_del(&bd->bd_ail_st_list);
+		list_del(&bd->bd_ail_gl_list);
+		atomic_dec(&bd->bd_gl->gl_ail_count);
+		brelse(bd->bd_bh);
+	}
+}
+
+static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
+{
+	struct gfs2_ail *ai, *safe;
+	unsigned int old_tail = sdp->sd_log_tail;
+	int wrap = (new_tail < old_tail);
+	int a, b, rm;
+
+	gfs2_log_lock(sdp);
+
+	list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
+		a = (old_tail <= ai->ai_first);
+		b = (ai->ai_first < new_tail);
+		rm = (wrap) ? (a || b) : (a && b);
+		if (!rm)
+			continue;
+
+		gfs2_ail2_empty_one(sdp, ai);
+		list_del(&ai->ai_list);
+		gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
+		gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
+		kfree(ai);
+	}
+
+	gfs2_log_unlock(sdp);
+}
+
+/**
+ * gfs2_log_reserve - Make a log reservation
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks to reserve
+ *
+ * Returns: errno
+ */
+
+int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
+{
+	unsigned int try = 0;
+
+	if (gfs2_assert_warn(sdp, blks) ||
+	    gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
+		return -EINVAL;
+
+	mutex_lock(&sdp->sd_log_reserve_mutex);
+	gfs2_log_lock(sdp);
+	while(sdp->sd_log_blks_free <= blks) {
+		gfs2_log_unlock(sdp);
+		gfs2_ail1_empty(sdp, 0);
+		gfs2_log_flush(sdp, NULL);
+
+		if (try++)
+			gfs2_ail1_start(sdp, 0);
+		gfs2_log_lock(sdp);
+	}
+	sdp->sd_log_blks_free -= blks;
+	gfs2_log_unlock(sdp);
+	mutex_unlock(&sdp->sd_log_reserve_mutex);
+
+	down_read(&sdp->sd_log_flush_lock);
+
+	return 0;
+}
+
+/**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ */
+
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+
+	gfs2_log_lock(sdp);
+	sdp->sd_log_blks_free += blks;
+	gfs2_assert_withdraw(sdp,
+			     sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+	gfs2_log_unlock(sdp);
+	up_read(&sdp->sd_log_flush_lock);
+}
+
+static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
+{
+	struct inode *inode = sdp->sd_jdesc->jd_inode;
+	int error;
+	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+
+	bh_map.b_size = 1 << inode->i_blkbits;
+	error = gfs2_block_map(inode, lbn, 0, &bh_map);
+	if (error || !bh_map.b_blocknr)
+		printk(KERN_INFO "error=%d, dbn=%llu lbn=%u", error, bh_map.b_blocknr, lbn);
+	gfs2_assert_withdraw(sdp, !error && bh_map.b_blocknr);
+
+	return bh_map.b_blocknr;
+}
+
+/**
+ * log_distance - Compute distance between two journal blocks
+ * @sdp: The GFS2 superblock
+ * @newer: The most recent journal block of the pair
+ * @older: The older journal block of the pair
+ *
+ *   Compute the distance (in the journal direction) between two
+ *   blocks in the journal
+ *
+ * Returns: the distance in blocks
+ */
+
+static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer,
+					unsigned int older)
+{
+	int dist;
+
+	dist = newer - older;
+	if (dist < 0)
+		dist += sdp->sd_jdesc->jd_blocks;
+
+	return dist;
+}
+
+static unsigned int current_tail(struct gfs2_sbd *sdp)
+{
+	struct gfs2_ail *ai;
+	unsigned int tail;
+
+	gfs2_log_lock(sdp);
+
+	if (list_empty(&sdp->sd_ail1_list)) {
+		tail = sdp->sd_log_head;
+	} else {
+		ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list);
+		tail = ai->ai_first;
+	}
+
+	gfs2_log_unlock(sdp);
+
+	return tail;
+}
+
+static inline void log_incr_head(struct gfs2_sbd *sdp)
+{
+	if (sdp->sd_log_flush_head == sdp->sd_log_tail)
+		gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head);
+
+	if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
+		sdp->sd_log_flush_head = 0;
+		sdp->sd_log_flush_wrapped = 1;
+	}
+}
+
+/**
+ * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: the buffer_head
+ */
+
+struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
+{
+	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
+	struct gfs2_log_buf *lb;
+	struct buffer_head *bh;
+
+	lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
+	list_add(&lb->lb_list, &sdp->sd_log_flush_list);
+
+	bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
+	lock_buffer(bh);
+	memset(bh->b_data, 0, bh->b_size);
+	set_buffer_uptodate(bh);
+	clear_buffer_dirty(bh);
+	unlock_buffer(bh);
+
+	log_incr_head(sdp);
+
+	return bh;
+}
+
+/**
+ * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
+ * @sdp: the filesystem
+ * @data: the data the buffer_head should point to
+ *
+ * Returns: the log buffer descriptor
+ */
+
+struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+				      struct buffer_head *real)
+{
+	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
+	struct gfs2_log_buf *lb;
+	struct buffer_head *bh;
+
+	lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
+	list_add(&lb->lb_list, &sdp->sd_log_flush_list);
+	lb->lb_real = real;
+
+	bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
+	atomic_set(&bh->b_count, 1);
+	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
+	set_bh_page(bh, real->b_page, bh_offset(real));
+	bh->b_blocknr = blkno;
+	bh->b_size = sdp->sd_sb.sb_bsize;
+	bh->b_bdev = sdp->sd_vfs->s_bdev;
+
+	log_incr_head(sdp);
+
+	return bh;
+}
+
+static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail, int pull)
+{
+	unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);
+
+	ail2_empty(sdp, new_tail);
+
+	gfs2_log_lock(sdp);
+	sdp->sd_log_blks_free += dist - (pull ? 1 : 0);
+	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks);
+	gfs2_log_unlock(sdp);
+
+	sdp->sd_log_tail = new_tail;
+}
+
+/**
+ * log_write_header - Get and initialize a journal header buffer
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: the initialized log buffer descriptor
+ */
+
+static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
+{
+	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
+	struct buffer_head *bh;
+	struct gfs2_log_header *lh;
+	unsigned int tail;
+	u32 hash;
+
+	bh = sb_getblk(sdp->sd_vfs, blkno);
+	lock_buffer(bh);
+	memset(bh->b_data, 0, bh->b_size);
+	set_buffer_uptodate(bh);
+	clear_buffer_dirty(bh);
+	unlock_buffer(bh);
+
+	gfs2_ail1_empty(sdp, 0);
+	tail = current_tail(sdp);
+
+	lh = (struct gfs2_log_header *)bh->b_data;
+	memset(lh, 0, sizeof(struct gfs2_log_header));
+	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+	lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
+	lh->lh_flags = cpu_to_be32(flags);
+	lh->lh_tail = cpu_to_be32(tail);
+	lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
+	hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
+	lh->lh_hash = cpu_to_be32(hash);
+
+	set_buffer_dirty(bh);
+	if (sync_dirty_buffer(bh))
+		gfs2_io_error_bh(sdp, bh);
+	brelse(bh);
+
+	if (sdp->sd_log_tail != tail)
+		log_pull_tail(sdp, tail, pull);
+	else
+		gfs2_assert_withdraw(sdp, !pull);
+
+	sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
+	log_incr_head(sdp);
+}
+
+static void log_flush_commit(struct gfs2_sbd *sdp)
+{
+	struct list_head *head = &sdp->sd_log_flush_list;
+	struct gfs2_log_buf *lb;
+	struct buffer_head *bh;
+
+	while (!list_empty(head)) {
+		lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
+		list_del(&lb->lb_list);
+		bh = lb->lb_bh;
+
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			gfs2_io_error_bh(sdp, bh);
+		if (lb->lb_real) {
+			while (atomic_read(&bh->b_count) != 1)  /* Grrrr... */
+				schedule();
+			free_buffer_head(bh);
+		} else
+			brelse(bh);
+		kfree(lb);
+	}
+
+	log_write_header(sdp, 0, 0);
+}
+
+/**
+ * gfs2_log_flush - flush incore transaction(s)
+ * @sdp: the filesystem
+ * @gl: The glock structure to flush.  If NULL, flush the whole incore log
+ *
+ */
+
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+{
+	struct gfs2_ail *ai;
+
+	down_write(&sdp->sd_log_flush_lock);
+
+	if (gl) {
+		gfs2_log_lock(sdp);
+		if (list_empty(&gl->gl_le.le_list)) {
+			gfs2_log_unlock(sdp);
+			up_write(&sdp->sd_log_flush_lock);
+			return;
+		}
+		gfs2_log_unlock(sdp);
+	}
+
+	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
+	INIT_LIST_HEAD(&ai->ai_ail1_list);
+	INIT_LIST_HEAD(&ai->ai_ail2_list);
+
+	gfs2_assert_withdraw(sdp, sdp->sd_log_num_buf == sdp->sd_log_commited_buf);
+	gfs2_assert_withdraw(sdp,
+			sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
+
+	sdp->sd_log_flush_head = sdp->sd_log_head;
+	sdp->sd_log_flush_wrapped = 0;
+	ai->ai_first = sdp->sd_log_flush_head;
+
+	lops_before_commit(sdp);
+	if (!list_empty(&sdp->sd_log_flush_list))
+		log_flush_commit(sdp);
+	else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle)
+		log_write_header(sdp, 0, PULL);
+	lops_after_commit(sdp, ai);
+
+	gfs2_log_lock(sdp);
+	sdp->sd_log_head = sdp->sd_log_flush_head;
+	sdp->sd_log_blks_free -= sdp->sd_log_num_hdrs;
+	sdp->sd_log_blks_reserved = 0;
+	sdp->sd_log_commited_buf = 0;
+	sdp->sd_log_num_hdrs = 0;
+	sdp->sd_log_commited_revoke = 0;
+
+	if (!list_empty(&ai->ai_ail1_list)) {
+		list_add(&ai->ai_list, &sdp->sd_ail1_list);
+		ai = NULL;
+	}
+	gfs2_log_unlock(sdp);
+
+	sdp->sd_vfs->s_dirt = 0;
+	up_write(&sdp->sd_log_flush_lock);
+
+	kfree(ai);
+}
+
+static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	unsigned int reserved = 0;
+	unsigned int old;
+
+	gfs2_log_lock(sdp);
+
+	sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
+	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_buf) >= 0);
+	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
+	gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
+
+	if (sdp->sd_log_commited_buf)
+		reserved += sdp->sd_log_commited_buf;
+	if (sdp->sd_log_commited_revoke)
+		reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
+					    sizeof(u64));
+	if (reserved)
+		reserved++;
+
+	old = sdp->sd_log_blks_free;
+	sdp->sd_log_blks_free += tr->tr_reserved -
+				 (reserved - sdp->sd_log_blks_reserved);
+
+	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_free >= old);
+	gfs2_assert_withdraw(sdp,
+			     sdp->sd_log_blks_free <= sdp->sd_jdesc->jd_blocks +
+			     sdp->sd_log_num_hdrs);
+
+	sdp->sd_log_blks_reserved = reserved;
+
+	gfs2_log_unlock(sdp);
+}
+
+/**
+ * gfs2_log_commit - Commit a transaction to the log
+ * @sdp: the filesystem
+ * @tr: the transaction
+ *
+ * Returns: errno
+ */
+
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	log_refund(sdp, tr);
+	lops_incore_commit(sdp, tr);
+
+	sdp->sd_vfs->s_dirt = 1;
+	up_read(&sdp->sd_log_flush_lock);
+
+	gfs2_log_lock(sdp);
+	if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks)) {
+		gfs2_log_unlock(sdp);
+		gfs2_log_flush(sdp, NULL);
+	} else {
+		gfs2_log_unlock(sdp);
+	}
+}
+
+/**
+ * gfs2_log_shutdown - write a shutdown header into a journal
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_log_shutdown(struct gfs2_sbd *sdp)
+{
+	down_write(&sdp->sd_log_flush_lock);
+
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_hdrs);
+	gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
+
+	sdp->sd_log_flush_head = sdp->sd_log_head;
+	sdp->sd_log_flush_wrapped = 0;
+
+	log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT, 0);
+
+	gfs2_assert_warn(sdp, sdp->sd_log_blks_free == sdp->sd_jdesc->jd_blocks);
+	gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
+	gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
+
+	sdp->sd_log_head = sdp->sd_log_flush_head;
+	sdp->sd_log_tail = sdp->sd_log_head;
+
+	up_write(&sdp->sd_log_flush_lock);
+}
+
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
new file mode 100644
index 000000000000..7f5737d55612
--- /dev/null
+++ b/fs/gfs2/log.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __LOG_DOT_H__
+#define __LOG_DOT_H__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include "incore.h"
+
+/**
+ * gfs2_log_lock - acquire the right to mess with the log manager
+ * @sdp: the filesystem
+ *
+ */
+
+static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+{
+	spin_lock(&sdp->sd_log_lock);
+}
+
+/**
+ * gfs2_log_unlock - release the right to mess with the log manager
+ * @sdp: the filesystem
+ *
+ */
+
+static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+{
+	spin_unlock(&sdp->sd_log_lock);
+}
+
+static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
+					  unsigned int value)
+{
+	if (++value == sdp->sd_jdesc->jd_blocks) {
+		value = 0;
+	}
+	sdp->sd_log_head = sdp->sd_log_tail = value;
+}
+
+unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+			    unsigned int ssize);
+
+void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags);
+int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
+
+int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
+
+struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
+struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+				      struct buffer_head *real);
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+
+void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+
+#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
new file mode 100644
index 000000000000..ab6d1115f95d
--- /dev/null
+++ b/fs/gfs2/lops.c
@@ -0,0 +1,809 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+	struct gfs2_glock *gl;
+	struct gfs2_trans *tr = current->journal_info;
+
+	tr->tr_touched = 1;
+
+	if (!list_empty(&le->le_list))
+		return;
+
+	gl = container_of(le, struct gfs2_glock, gl_le);
+	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
+		return;
+	gfs2_glock_hold(gl);
+	set_bit(GLF_DIRTY, &gl->gl_flags);
+
+	gfs2_log_lock(sdp);
+	sdp->sd_log_num_gl++;
+	list_add(&le->le_list, &sdp->sd_log_le_gl);
+	gfs2_log_unlock(sdp);
+}
+
+static void glock_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+	struct list_head *head = &sdp->sd_log_le_gl;
+	struct gfs2_glock *gl;
+
+	while (!list_empty(head)) {
+		gl = list_entry(head->next, struct gfs2_glock, gl_le.le_list);
+		list_del_init(&gl->gl_le.le_list);
+		sdp->sd_log_num_gl--;
+
+		gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl));
+		gfs2_glock_put(gl);
+	}
+	gfs2_assert_warn(sdp, !sdp->sd_log_num_gl);
+}
+
+static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+	struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+	struct gfs2_trans *tr;
+
+	if (!list_empty(&bd->bd_list_tr))
+		return;
+
+	tr = current->journal_info;
+	tr->tr_touched = 1;
+	tr->tr_num_buf++;
+	list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+
+	if (!list_empty(&le->le_list))
+		return;
+
+	gfs2_trans_add_gl(bd->bd_gl);
+
+	gfs2_meta_check(sdp, bd->bd_bh);
+	gfs2_pin(sdp, bd->bd_bh);
+
+	gfs2_log_lock(sdp);
+	sdp->sd_log_num_buf++;
+	list_add(&le->le_list, &sdp->sd_log_le_buf);
+	gfs2_log_unlock(sdp);
+
+	tr->tr_num_buf_new++;
+}
+
+static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
+{
+	struct list_head *head = &tr->tr_list_buf;
+	struct gfs2_bufdata *bd;
+
+	while (!list_empty(head)) {
+		bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
+		list_del_init(&bd->bd_list_tr);
+		tr->tr_num_buf--;
+	}
+	gfs2_assert_warn(sdp, !tr->tr_num_buf);
+}
+
+static void buf_lo_before_commit(struct gfs2_sbd *sdp)
+{
+	struct buffer_head *bh;
+	struct gfs2_log_descriptor *ld;
+	struct gfs2_bufdata *bd1 = NULL, *bd2;
+	unsigned int total = sdp->sd_log_num_buf;
+	unsigned int offset = sizeof(struct gfs2_log_descriptor);
+	unsigned int limit;
+	unsigned int num;
+	unsigned n;
+	__be64 *ptr;
+
+	offset += sizeof(__be64) - 1;
+	offset &= ~(sizeof(__be64) - 1);
+	limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
+	/* for 4k blocks, limit = 503 */
+
+	bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
+	while(total) {
+		num = total;
+		if (total > limit)
+			num = limit;
+		bh = gfs2_log_get_buf(sdp);
+		sdp->sd_log_num_hdrs++;
+		ld = (struct gfs2_log_descriptor *)bh->b_data;
+		ptr = (__be64 *)(bh->b_data + offset);
+		ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+		ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
+		ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
+		ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
+		ld->ld_length = cpu_to_be32(num + 1);
+		ld->ld_data1 = cpu_to_be32(num);
+		ld->ld_data2 = cpu_to_be32(0);
+		memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+
+		n = 0;
+		list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
+					     bd_le.le_list) {
+			*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+			if (++n >= num)
+				break;
+		}
+
+		set_buffer_dirty(bh);
+		ll_rw_block(WRITE, 1, &bh);
+
+		n = 0;
+		list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
+					     bd_le.le_list) {
+			bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
+			set_buffer_dirty(bh);
+			ll_rw_block(WRITE, 1, &bh);
+			if (++n >= num)
+				break;
+		}
+
+		total -= num;
+	}
+}
+
+static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+	struct list_head *head = &sdp->sd_log_le_buf;
+	struct gfs2_bufdata *bd;
+
+	while (!list_empty(head)) {
+		bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
+		list_del_init(&bd->bd_le.le_list);
+		sdp->sd_log_num_buf--;
+
+		gfs2_unpin(sdp, bd->bd_bh, ai);
+	}
+	gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
+}
+
+static void buf_lo_before_scan(struct gfs2_jdesc *jd,
+			       struct gfs2_log_header *head, int pass)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+	if (pass != 0)
+		return;
+
+	sdp->sd_found_blocks = 0;
+	sdp->sd_replayed_blocks = 0;
+}
+
+static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+				struct gfs2_log_descriptor *ld, __be64 *ptr,
+				int pass)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	unsigned int blks = be32_to_cpu(ld->ld_data1);
+	struct buffer_head *bh_log, *bh_ip;
+	u64 blkno;
+	int error = 0;
+
+	if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA)
+		return 0;
+
+	gfs2_replay_incr_blk(sdp, &start);
+
+	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+		blkno = be64_to_cpu(*ptr++);
+
+		sdp->sd_found_blocks++;
+
+		if (gfs2_revoke_check(sdp, blkno, start))
+			continue;
+
+		error = gfs2_replay_read_block(jd, start, &bh_log);
+		if (error)
+			return error;
+
+		bh_ip = gfs2_meta_new(gl, blkno);
+		memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
+
+		if (gfs2_meta_check(sdp, bh_ip))
+			error = -EIO;
+		else
+			mark_buffer_dirty(bh_ip);
+
+		brelse(bh_log);
+		brelse(bh_ip);
+
+		if (error)
+			break;
+
+		sdp->sd_replayed_blocks++;
+	}
+
+	return error;
+}
+
+static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+	if (error) {
+		gfs2_meta_sync(ip->i_gl);
+		return;
+	}
+	if (pass != 1)
+		return;
+
+	gfs2_meta_sync(ip->i_gl);
+
+	fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
+	        jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
+}
+
+static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+	struct gfs2_trans *tr;
+
+	tr = current->journal_info;
+	tr->tr_touched = 1;
+	tr->tr_num_revoke++;
+
+	gfs2_log_lock(sdp);
+	sdp->sd_log_num_revoke++;
+	list_add(&le->le_list, &sdp->sd_log_le_revoke);
+	gfs2_log_unlock(sdp);
+}
+
+static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
+{
+	struct gfs2_log_descriptor *ld;
+	struct gfs2_meta_header *mh;
+	struct buffer_head *bh;
+	unsigned int offset;
+	struct list_head *head = &sdp->sd_log_le_revoke;
+	struct gfs2_revoke *rv;
+
+	if (!sdp->sd_log_num_revoke)
+		return;
+
+	bh = gfs2_log_get_buf(sdp);
+	ld = (struct gfs2_log_descriptor *)bh->b_data;
+	ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
+	ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
+	ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
+	ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
+						    sizeof(u64)));
+	ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
+	ld->ld_data2 = cpu_to_be32(0);
+	memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+	offset = sizeof(struct gfs2_log_descriptor);
+
+	while (!list_empty(head)) {
+		rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
+		list_del_init(&rv->rv_le.le_list);
+		sdp->sd_log_num_revoke--;
+
+		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
+			set_buffer_dirty(bh);
+			ll_rw_block(WRITE, 1, &bh);
+
+			bh = gfs2_log_get_buf(sdp);
+			mh = (struct gfs2_meta_header *)bh->b_data;
+			mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
+			mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB);
+			mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB);
+			offset = sizeof(struct gfs2_meta_header);
+		}
+
+		*(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
+		kfree(rv);
+
+		offset += sizeof(u64);
+	}
+	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
+
+	set_buffer_dirty(bh);
+	ll_rw_block(WRITE, 1, &bh);
+}
+
+static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
+				  struct gfs2_log_header *head, int pass)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+	if (pass != 0)
+		return;
+
+	sdp->sd_found_revokes = 0;
+	sdp->sd_replay_tail = head->lh_tail;
+}
+
+static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+				   struct gfs2_log_descriptor *ld, __be64 *ptr,
+				   int pass)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	unsigned int blks = be32_to_cpu(ld->ld_length);
+	unsigned int revokes = be32_to_cpu(ld->ld_data1);
+	struct buffer_head *bh;
+	unsigned int offset;
+	u64 blkno;
+	int first = 1;
+	int error;
+
+	if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE)
+		return 0;
+
+	offset = sizeof(struct gfs2_log_descriptor);
+
+	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+		error = gfs2_replay_read_block(jd, start, &bh);
+		if (error)
+			return error;
+
+		if (!first)
+			gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB);
+
+		while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
+			blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
+
+			error = gfs2_revoke_add(sdp, blkno, start);
+			if (error < 0)
+				return error;
+			else if (error)
+				sdp->sd_found_revokes++;
+
+			if (!--revokes)
+				break;
+			offset += sizeof(u64);
+		}
+
+		brelse(bh);
+		offset = sizeof(struct gfs2_meta_header);
+		first = 0;
+	}
+
+	return 0;
+}
+
+static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+	if (error) {
+		gfs2_revoke_clean(sdp);
+		return;
+	}
+	if (pass != 1)
+		return;
+
+	fs_info(sdp, "jid=%u: Found %u revoke tags\n",
+	        jd->jd_jid, sdp->sd_found_revokes);
+
+	gfs2_revoke_clean(sdp);
+}
+
+static void rg_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_trans *tr = current->journal_info;
+
+	tr->tr_touched = 1;
+
+	if (!list_empty(&le->le_list))
+		return;
+
+	rgd = container_of(le, struct gfs2_rgrpd, rd_le);
+	gfs2_rgrp_bh_hold(rgd);
+
+	gfs2_log_lock(sdp);
+	sdp->sd_log_num_rg++;
+	list_add(&le->le_list, &sdp->sd_log_le_rg);
+	gfs2_log_unlock(sdp);
+}
+
+static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+	struct list_head *head = &sdp->sd_log_le_rg;
+	struct gfs2_rgrpd *rgd;
+
+	while (!list_empty(head)) {
+		rgd = list_entry(head->next, struct gfs2_rgrpd, rd_le.le_list);
+		list_del_init(&rgd->rd_le.le_list);
+		sdp->sd_log_num_rg--;
+
+		gfs2_rgrp_repolish_clones(rgd);
+		gfs2_rgrp_bh_put(rgd);
+	}
+	gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
+}
+
+/**
+ * databuf_lo_add - Add a databuf to the transaction.
+ *
+ * This is used in two distinct cases:
+ * i) In ordered write mode
+ *    We put the data buffer on a list so that we can ensure that its
+ *    synced to disk at the right time
+ * ii) In journaled data mode
+ *    We need to journal the data block in the same way as metadata in
+ *    the functions above. The difference is that here we have a tag
+ *    which is two __be64's being the block number (as per meta data)
+ *    and a flag which says whether the data block needs escaping or
+ *    not. This means we need a new log entry for each 251 or so data
+ *    blocks, which isn't an enormous overhead but twice as much as
+ *    for normal metadata blocks.
+ */
+static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+	struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+	struct gfs2_trans *tr = current->journal_info;
+	struct address_space *mapping = bd->bd_bh->b_page->mapping;
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+
+	tr->tr_touched = 1;
+	if (list_empty(&bd->bd_list_tr) &&
+	    (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
+		tr->tr_num_buf++;
+		list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+		gfs2_pin(sdp, bd->bd_bh);
+		tr->tr_num_buf_new++;
+	}
+	gfs2_trans_add_gl(bd->bd_gl);
+	gfs2_log_lock(sdp);
+	if (list_empty(&le->le_list)) {
+		if (ip->i_di.di_flags & GFS2_DIF_JDATA)
+			sdp->sd_log_num_jdata++;
+		sdp->sd_log_num_databuf++;
+		list_add(&le->le_list, &sdp->sd_log_le_databuf);
+	}
+	gfs2_log_unlock(sdp);
+}
+
+static int gfs2_check_magic(struct buffer_head *bh)
+{
+	struct page *page = bh->b_page;
+	void *kaddr;
+	__be32 *ptr;
+	int rv = 0;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	ptr = kaddr + bh_offset(bh);
+	if (*ptr == cpu_to_be32(GFS2_MAGIC))
+		rv = 1;
+	kunmap_atomic(kaddr, KM_USER0);
+
+	return rv;
+}
+
+/**
+ * databuf_lo_before_commit - Scan the data buffers, writing as we go
+ *
+ * Here we scan through the lists of buffers and make the assumption
+ * that any buffer thats been pinned is being journaled, and that
+ * any unpinned buffer is an ordered write data buffer and therefore
+ * will be written back rather than journaled.
+ */
+static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
+{
+	LIST_HEAD(started);
+	struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
+	struct buffer_head *bh = NULL;
+	unsigned int offset = sizeof(struct gfs2_log_descriptor);
+	struct gfs2_log_descriptor *ld;
+	unsigned int limit;
+	unsigned int total_dbuf = sdp->sd_log_num_databuf;
+	unsigned int total_jdata = sdp->sd_log_num_jdata;
+	unsigned int num, n;
+	__be64 *ptr = NULL;
+
+	offset += 2*sizeof(__be64) - 1;
+	offset &= ~(2*sizeof(__be64) - 1);
+	limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
+
+	/*
+	 * Start writing ordered buffers, write journaled buffers
+	 * into the log along with a header
+	 */
+	gfs2_log_lock(sdp);
+	bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
+				       bd_le.le_list);
+	while(total_dbuf) {
+		num = total_jdata;
+		if (num > limit)
+			num = limit;
+		n = 0;
+		list_for_each_entry_safe_continue(bd1, bdt,
+						  &sdp->sd_log_le_databuf,
+						  bd_le.le_list) {
+			/* An ordered write buffer */
+			if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
+				list_move(&bd1->bd_le.le_list, &started);
+				if (bd1 == bd2) {
+					bd2 = NULL;
+					bd2 = list_prepare_entry(bd2,
+							&sdp->sd_log_le_databuf,
+							bd_le.le_list);
+				}
+				total_dbuf--;
+				if (bd1->bd_bh) {
+					get_bh(bd1->bd_bh);
+					if (buffer_dirty(bd1->bd_bh)) {
+						gfs2_log_unlock(sdp);
+						wait_on_buffer(bd1->bd_bh);
+						ll_rw_block(WRITE, 1,
+							    &bd1->bd_bh);
+						gfs2_log_lock(sdp);
+					}
+					brelse(bd1->bd_bh);
+					continue;
+				}
+				continue;
+			} else if (bd1->bd_bh) { /* A journaled buffer */
+				int magic;
+				gfs2_log_unlock(sdp);
+				if (!bh) {
+					bh = gfs2_log_get_buf(sdp);
+					sdp->sd_log_num_hdrs++;
+					ld = (struct gfs2_log_descriptor *)
+					     bh->b_data;
+					ptr = (__be64 *)(bh->b_data + offset);
+					ld->ld_header.mh_magic =
+						cpu_to_be32(GFS2_MAGIC);
+					ld->ld_header.mh_type =
+						cpu_to_be32(GFS2_METATYPE_LD);
+					ld->ld_header.mh_format =
+						cpu_to_be32(GFS2_FORMAT_LD);
+					ld->ld_type =
+						cpu_to_be32(GFS2_LOG_DESC_JDATA);
+					ld->ld_length = cpu_to_be32(num + 1);
+					ld->ld_data1 = cpu_to_be32(num);
+					ld->ld_data2 = cpu_to_be32(0);
+					memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+				}
+				magic = gfs2_check_magic(bd1->bd_bh);
+				*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+				*ptr++ = cpu_to_be64((__u64)magic);
+				clear_buffer_escaped(bd1->bd_bh);
+				if (unlikely(magic != 0))
+					set_buffer_escaped(bd1->bd_bh);
+				gfs2_log_lock(sdp);
+				if (n++ > num)
+					break;
+			} else if (!bd1->bd_bh) {
+				total_dbuf--;
+				sdp->sd_log_num_databuf--;
+				list_del_init(&bd1->bd_le.le_list);
+				if (bd1 == bd2) {
+					bd2 = NULL;
+					bd2 = list_prepare_entry(bd2,
+						&sdp->sd_log_le_databuf,
+						bd_le.le_list);
+                                }
+				kmem_cache_free(gfs2_bufdata_cachep, bd1);
+			}
+		}
+		gfs2_log_unlock(sdp);
+		if (bh) {
+			set_buffer_dirty(bh);
+			ll_rw_block(WRITE, 1, &bh);
+			bh = NULL;
+		}
+		n = 0;
+		gfs2_log_lock(sdp);
+		list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
+					     bd_le.le_list) {
+			if (!bd2->bd_bh)
+				continue;
+			/* copy buffer if it needs escaping */
+			gfs2_log_unlock(sdp);
+			if (unlikely(buffer_escaped(bd2->bd_bh))) {
+				void *kaddr;
+				struct page *page = bd2->bd_bh->b_page;
+				bh = gfs2_log_get_buf(sdp);
+				kaddr = kmap_atomic(page, KM_USER0);
+				memcpy(bh->b_data,
+				       kaddr + bh_offset(bd2->bd_bh),
+				       sdp->sd_sb.sb_bsize);
+				kunmap_atomic(kaddr, KM_USER0);
+				*(__be32 *)bh->b_data = 0;
+			} else {
+				bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
+			}
+			set_buffer_dirty(bh);
+			ll_rw_block(WRITE, 1, &bh);
+			gfs2_log_lock(sdp);
+			if (++n >= num)
+				break;
+		}
+		bh = NULL;
+		total_dbuf -= num;
+		total_jdata -= num;
+	}
+	gfs2_log_unlock(sdp);
+
+	/* Wait on all ordered buffers */
+	while (!list_empty(&started)) {
+		gfs2_log_lock(sdp);
+		bd1 = list_entry(started.next, struct gfs2_bufdata,
+				 bd_le.le_list);
+		list_del_init(&bd1->bd_le.le_list);
+		sdp->sd_log_num_databuf--;
+		bh = bd1->bd_bh;
+		if (bh) {
+			bh->b_private = NULL;
+			get_bh(bh);
+			gfs2_log_unlock(sdp);
+			wait_on_buffer(bh);
+			brelse(bh);
+		} else
+			gfs2_log_unlock(sdp);
+
+		kmem_cache_free(gfs2_bufdata_cachep, bd1);
+	}
+
+	/* We've removed all the ordered write bufs here, so only jdata left */
+	gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
+}
+
+static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+				    struct gfs2_log_descriptor *ld,
+				    __be64 *ptr, int pass)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	unsigned int blks = be32_to_cpu(ld->ld_data1);
+	struct buffer_head *bh_log, *bh_ip;
+	u64 blkno;
+	u64 esc;
+	int error = 0;
+
+	if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
+		return 0;
+
+	gfs2_replay_incr_blk(sdp, &start);
+	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+		blkno = be64_to_cpu(*ptr++);
+		esc = be64_to_cpu(*ptr++);
+
+		sdp->sd_found_blocks++;
+
+		if (gfs2_revoke_check(sdp, blkno, start))
+			continue;
+
+		error = gfs2_replay_read_block(jd, start, &bh_log);
+		if (error)
+			return error;
+
+		bh_ip = gfs2_meta_new(gl, blkno);
+		memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
+
+		/* Unescape */
+		if (esc) {
+			__be32 *eptr = (__be32 *)bh_ip->b_data;
+			*eptr = cpu_to_be32(GFS2_MAGIC);
+		}
+		mark_buffer_dirty(bh_ip);
+
+		brelse(bh_log);
+		brelse(bh_ip);
+		if (error)
+			break;
+
+		sdp->sd_replayed_blocks++;
+	}
+
+	return error;
+}
+
+/* FIXME: sort out accounting for log blocks etc. */
+
+static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+
+	if (error) {
+		gfs2_meta_sync(ip->i_gl);
+		return;
+	}
+	if (pass != 1)
+		return;
+
+	/* data sync? */
+	gfs2_meta_sync(ip->i_gl);
+
+	fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
+		jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
+}
+
+static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+	struct list_head *head = &sdp->sd_log_le_databuf;
+	struct gfs2_bufdata *bd;
+
+	while (!list_empty(head)) {
+		bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
+		list_del_init(&bd->bd_le.le_list);
+		sdp->sd_log_num_databuf--;
+		sdp->sd_log_num_jdata--;
+		gfs2_unpin(sdp, bd->bd_bh, ai);
+	}
+	gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
+	gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
+}
+
+
+const struct gfs2_log_operations gfs2_glock_lops = {
+	.lo_add = glock_lo_add,
+	.lo_after_commit = glock_lo_after_commit,
+	.lo_name = "glock",
+};
+
+const struct gfs2_log_operations gfs2_buf_lops = {
+	.lo_add = buf_lo_add,
+	.lo_incore_commit = buf_lo_incore_commit,
+	.lo_before_commit = buf_lo_before_commit,
+	.lo_after_commit = buf_lo_after_commit,
+	.lo_before_scan = buf_lo_before_scan,
+	.lo_scan_elements = buf_lo_scan_elements,
+	.lo_after_scan = buf_lo_after_scan,
+	.lo_name = "buf",
+};
+
+const struct gfs2_log_operations gfs2_revoke_lops = {
+	.lo_add = revoke_lo_add,
+	.lo_before_commit = revoke_lo_before_commit,
+	.lo_before_scan = revoke_lo_before_scan,
+	.lo_scan_elements = revoke_lo_scan_elements,
+	.lo_after_scan = revoke_lo_after_scan,
+	.lo_name = "revoke",
+};
+
+const struct gfs2_log_operations gfs2_rg_lops = {
+	.lo_add = rg_lo_add,
+	.lo_after_commit = rg_lo_after_commit,
+	.lo_name = "rg",
+};
+
+const struct gfs2_log_operations gfs2_databuf_lops = {
+	.lo_add = databuf_lo_add,
+	.lo_incore_commit = buf_lo_incore_commit,
+	.lo_before_commit = databuf_lo_before_commit,
+	.lo_after_commit = databuf_lo_after_commit,
+	.lo_scan_elements = databuf_lo_scan_elements,
+	.lo_after_scan = databuf_lo_after_scan,
+	.lo_name = "databuf",
+};
+
+const struct gfs2_log_operations *gfs2_log_ops[] = {
+	&gfs2_glock_lops,
+	&gfs2_buf_lops,
+	&gfs2_revoke_lops,
+	&gfs2_rg_lops,
+	&gfs2_databuf_lops,
+	NULL,
+};
+
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
new file mode 100644
index 000000000000..5839c05ae6be
--- /dev/null
+++ b/fs/gfs2/lops.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __LOPS_DOT_H__
+#define __LOPS_DOT_H__
+
+#include <linux/list.h>
+#include "incore.h"
+
+extern const struct gfs2_log_operations gfs2_glock_lops;
+extern const struct gfs2_log_operations gfs2_buf_lops;
+extern const struct gfs2_log_operations gfs2_revoke_lops;
+extern const struct gfs2_log_operations gfs2_rg_lops;
+extern const struct gfs2_log_operations gfs2_databuf_lops;
+
+extern const struct gfs2_log_operations *gfs2_log_ops[];
+
+static inline void lops_init_le(struct gfs2_log_element *le,
+				const struct gfs2_log_operations *lops)
+{
+	INIT_LIST_HEAD(&le->le_list);
+	le->le_ops = lops;
+}
+
+static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+	if (le->le_ops->lo_add)
+		le->le_ops->lo_add(sdp, le);
+}
+
+static inline void lops_incore_commit(struct gfs2_sbd *sdp,
+				      struct gfs2_trans *tr)
+{
+	int x;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_incore_commit)
+			gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
+}
+
+static inline void lops_before_commit(struct gfs2_sbd *sdp)
+{
+	int x;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_before_commit)
+			gfs2_log_ops[x]->lo_before_commit(sdp);
+}
+
+static inline void lops_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+	int x;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_after_commit)
+			gfs2_log_ops[x]->lo_after_commit(sdp, ai);
+}
+
+static inline void lops_before_scan(struct gfs2_jdesc *jd,
+				    struct gfs2_log_header *head,
+				    unsigned int pass)
+{
+	int x;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_before_scan)
+			gfs2_log_ops[x]->lo_before_scan(jd, head, pass);
+}
+
+static inline int lops_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+				     struct gfs2_log_descriptor *ld,
+				     __be64 *ptr,
+				     unsigned int pass)
+{
+	int x, error;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_scan_elements) {
+			error = gfs2_log_ops[x]->lo_scan_elements(jd, start,
+								  ld, ptr, pass);
+			if (error)
+				return error;
+		}
+
+	return 0;
+}
+
+static inline void lops_after_scan(struct gfs2_jdesc *jd, int error,
+				   unsigned int pass)
+{
+	int x;
+	for (x = 0; gfs2_log_ops[x]; x++)
+		if (gfs2_log_ops[x]->lo_before_scan)
+			gfs2_log_ops[x]->lo_after_scan(jd, error, pass);
+}
+
+#endif /* __LOPS_DOT_H__ */
+
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
new file mode 100644
index 000000000000..21508a13bb78
--- /dev/null
+++ b/fs/gfs2/main.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/atomic.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "ops_fstype.h"
+#include "sys.h"
+#include "util.h"
+#include "glock.h"
+
+static void gfs2_init_inode_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct gfs2_inode *ip = foo;
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		inode_init_once(&ip->i_inode);
+		spin_lock_init(&ip->i_spin);
+		init_rwsem(&ip->i_rw_mutex);
+		memset(ip->i_cache, 0, sizeof(ip->i_cache));
+	}
+}
+
+static void gfs2_init_glock_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct gfs2_glock *gl = foo;
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		INIT_HLIST_NODE(&gl->gl_list);
+		spin_lock_init(&gl->gl_spin);
+		INIT_LIST_HEAD(&gl->gl_holders);
+		INIT_LIST_HEAD(&gl->gl_waiters1);
+		INIT_LIST_HEAD(&gl->gl_waiters2);
+		INIT_LIST_HEAD(&gl->gl_waiters3);
+		gl->gl_lvb = NULL;
+		atomic_set(&gl->gl_lvb_count, 0);
+		INIT_LIST_HEAD(&gl->gl_reclaim);
+		INIT_LIST_HEAD(&gl->gl_ail_list);
+		atomic_set(&gl->gl_ail_count, 0);
+	}
+}
+
+/**
+ * init_gfs2_fs - Register GFS2 as a filesystem
+ *
+ * Returns: 0 on success, error code on failure
+ */
+
+static int __init init_gfs2_fs(void)
+{
+	int error;
+
+	error = gfs2_sys_init();
+	if (error)
+		return error;
+
+	error = gfs2_glock_init();
+	if (error)
+		goto fail;
+
+	error = -ENOMEM;
+	gfs2_glock_cachep = kmem_cache_create("gfs2_glock",
+					      sizeof(struct gfs2_glock),
+					      0, 0,
+					      gfs2_init_glock_once, NULL);
+	if (!gfs2_glock_cachep)
+		goto fail;
+
+	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
+					      sizeof(struct gfs2_inode),
+					      0, (SLAB_RECLAIM_ACCOUNT|
+					      SLAB_PANIC|SLAB_MEM_SPREAD),
+					      gfs2_init_inode_once, NULL);
+	if (!gfs2_inode_cachep)
+		goto fail;
+
+	gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata",
+						sizeof(struct gfs2_bufdata),
+					        0, 0, NULL, NULL);
+	if (!gfs2_bufdata_cachep)
+		goto fail;
+
+	error = register_filesystem(&gfs2_fs_type);
+	if (error)
+		goto fail;
+
+	error = register_filesystem(&gfs2meta_fs_type);
+	if (error)
+		goto fail_unregister;
+
+	printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
+
+	return 0;
+
+fail_unregister:
+	unregister_filesystem(&gfs2_fs_type);
+fail:
+	if (gfs2_bufdata_cachep)
+		kmem_cache_destroy(gfs2_bufdata_cachep);
+
+	if (gfs2_inode_cachep)
+		kmem_cache_destroy(gfs2_inode_cachep);
+
+	if (gfs2_glock_cachep)
+		kmem_cache_destroy(gfs2_glock_cachep);
+
+	gfs2_sys_uninit();
+	return error;
+}
+
+/**
+ * exit_gfs2_fs - Unregister the file system
+ *
+ */
+
+static void __exit exit_gfs2_fs(void)
+{
+	unregister_filesystem(&gfs2_fs_type);
+	unregister_filesystem(&gfs2meta_fs_type);
+
+	kmem_cache_destroy(gfs2_bufdata_cachep);
+	kmem_cache_destroy(gfs2_inode_cachep);
+	kmem_cache_destroy(gfs2_glock_cachep);
+
+	gfs2_sys_uninit();
+}
+
+MODULE_DESCRIPTION("Global File System");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+module_init(init_gfs2_fs);
+module_exit(exit_gfs2_fs);
+
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
new file mode 100644
index 000000000000..3912d6a4b1e6
--- /dev/null
+++ b/fs/gfs2/meta_io.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/delay.h>
+#include <linux/bio.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "ops_address.h"
+
+static int aspace_get_block(struct inode *inode, sector_t lblock,
+			    struct buffer_head *bh_result, int create)
+{
+	gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
+	return -EOPNOTSUPP;
+}
+
+static int gfs2_aspace_writepage(struct page *page,
+				 struct writeback_control *wbc)
+{
+	return block_write_full_page(page, aspace_get_block, wbc);
+}
+
+static const struct address_space_operations aspace_aops = {
+	.writepage = gfs2_aspace_writepage,
+	.releasepage = gfs2_releasepage,
+};
+
+/**
+ * gfs2_aspace_get - Create and initialize a struct inode structure
+ * @sdp: the filesystem the aspace is in
+ *
+ * Right now a struct inode is just a struct inode.  Maybe Linux
+ * will supply a more lightweight address space construct (that works)
+ * in the future.
+ *
+ * Make sure pages/buffers in this aspace aren't in high memory.
+ *
+ * Returns: the aspace
+ */
+
+struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
+{
+	struct inode *aspace;
+
+	aspace = new_inode(sdp->sd_vfs);
+	if (aspace) {
+		mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
+		aspace->i_mapping->a_ops = &aspace_aops;
+		aspace->i_size = ~0ULL;
+		aspace->i_private = NULL;
+		insert_inode_hash(aspace);
+	}
+	return aspace;
+}
+
+void gfs2_aspace_put(struct inode *aspace)
+{
+	remove_inode_hash(aspace);
+	iput(aspace);
+}
+
+/**
+ * gfs2_meta_inval - Invalidate all buffers associated with a glock
+ * @gl: the glock
+ *
+ */
+
+void gfs2_meta_inval(struct gfs2_glock *gl)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct inode *aspace = gl->gl_aspace;
+	struct address_space *mapping = gl->gl_aspace->i_mapping;
+
+	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+
+	atomic_inc(&aspace->i_writecount);
+	truncate_inode_pages(mapping, 0);
+	atomic_dec(&aspace->i_writecount);
+
+	gfs2_assert_withdraw(sdp, !mapping->nrpages);
+}
+
+/**
+ * gfs2_meta_sync - Sync all buffers associated with a glock
+ * @gl: The glock
+ *
+ */
+
+void gfs2_meta_sync(struct gfs2_glock *gl)
+{
+	struct address_space *mapping = gl->gl_aspace->i_mapping;
+	int error;
+
+	filemap_fdatawrite(mapping);
+	error = filemap_fdatawait(mapping);
+
+	if (error)
+		gfs2_io_error(gl->gl_sbd);
+}
+
+/**
+ * getbuf - Get a buffer with a given address space
+ * @sdp: the filesystem
+ * @aspace: the address space
+ * @blkno: the block number (filesystem scope)
+ * @create: 1 if the buffer should be created
+ *
+ * Returns: the buffer
+ */
+
+static struct buffer_head *getbuf(struct gfs2_sbd *sdp, struct inode *aspace,
+				  u64 blkno, int create)
+{
+	struct page *page;
+	struct buffer_head *bh;
+	unsigned int shift;
+	unsigned long index;
+	unsigned int bufnum;
+
+	shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
+	index = blkno >> shift;             /* convert block to page */
+	bufnum = blkno - (index << shift);  /* block buf index within page */
+
+	if (create) {
+		for (;;) {
+			page = grab_cache_page(aspace->i_mapping, index);
+			if (page)
+				break;
+			yield();
+		}
+	} else {
+		page = find_lock_page(aspace->i_mapping, index);
+		if (!page)
+			return NULL;
+	}
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0);
+
+	/* Locate header for our buffer within our page */
+	for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page)
+		/* Do nothing */;
+	get_bh(bh);
+
+	if (!buffer_mapped(bh))
+		map_bh(bh, sdp->sd_vfs, blkno);
+
+	unlock_page(page);
+	mark_page_accessed(page);
+	page_cache_release(page);
+
+	return bh;
+}
+
+static void meta_prep_new(struct buffer_head *bh)
+{
+	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+
+	lock_buffer(bh);
+	clear_buffer_dirty(bh);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+
+	mh->mh_magic = cpu_to_be32(GFS2_MAGIC);
+}
+
+/**
+ * gfs2_meta_new - Get a block
+ * @gl: The glock associated with this block
+ * @blkno: The block number
+ *
+ * Returns: The buffer
+ */
+
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
+{
+	struct buffer_head *bh;
+	bh = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
+	meta_prep_new(bh);
+	return bh;
+}
+
+/**
+ * gfs2_meta_read - Read a block from disk
+ * @gl: The glock covering the block
+ * @blkno: The block number
+ * @flags: flags
+ * @bhp: the place where the buffer is returned (NULL on failure)
+ *
+ * Returns: errno
+ */
+
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
+		   struct buffer_head **bhp)
+{
+	*bhp = getbuf(gl->gl_sbd, gl->gl_aspace, blkno, CREATE);
+	if (!buffer_uptodate(*bhp))
+		ll_rw_block(READ_META, 1, bhp);
+	if (flags & DIO_WAIT) {
+		int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
+		if (error) {
+			brelse(*bhp);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_meta_wait - Reread a block from disk
+ * @sdp: the filesystem
+ * @bh: The block to wait for
+ *
+ * Returns: errno
+ */
+
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+
+	wait_on_buffer(bh);
+
+	if (!buffer_uptodate(bh)) {
+		struct gfs2_trans *tr = current->journal_info;
+		if (tr && tr->tr_touched)
+			gfs2_io_error_bh(sdp, bh);
+		return -EIO;
+	}
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+
+	return 0;
+}
+
+/**
+ * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer
+ * @gl: the glock the buffer belongs to
+ * @bh: The buffer to be attached to
+ * @meta: Flag to indicate whether its metadata or not
+ */
+
+void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
+			 int meta)
+{
+	struct gfs2_bufdata *bd;
+
+	if (meta)
+		lock_page(bh->b_page);
+
+	if (bh->b_private) {
+		if (meta)
+			unlock_page(bh->b_page);
+		return;
+	}
+
+	bd = kmem_cache_alloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL),
+	memset(bd, 0, sizeof(struct gfs2_bufdata));
+	bd->bd_bh = bh;
+	bd->bd_gl = gl;
+
+	INIT_LIST_HEAD(&bd->bd_list_tr);
+	if (meta)
+		lops_init_le(&bd->bd_le, &gfs2_buf_lops);
+	else
+		lops_init_le(&bd->bd_le, &gfs2_databuf_lops);
+	bh->b_private = bd;
+
+	if (meta)
+		unlock_page(bh->b_page);
+}
+
+/**
+ * gfs2_pin - Pin a buffer in memory
+ * @sdp: the filesystem the buffer belongs to
+ * @bh: The buffer to be pinned
+ *
+ */
+
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	struct gfs2_bufdata *bd = bh->b_private;
+
+	gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
+
+	if (test_set_buffer_pinned(bh))
+		gfs2_assert_withdraw(sdp, 0);
+
+	wait_on_buffer(bh);
+
+	/* If this buffer is in the AIL and it has already been written
+	   to in-place disk block, remove it from the AIL. */
+
+	gfs2_log_lock(sdp);
+	if (bd->bd_ail && !buffer_in_io(bh))
+		list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
+	gfs2_log_unlock(sdp);
+
+	clear_buffer_dirty(bh);
+	wait_on_buffer(bh);
+
+	if (!buffer_uptodate(bh))
+		gfs2_io_error_bh(sdp, bh);
+
+	get_bh(bh);
+}
+
+/**
+ * gfs2_unpin - Unpin a buffer
+ * @sdp: the filesystem the buffer belongs to
+ * @bh: The buffer to unpin
+ * @ai:
+ *
+ */
+
+void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
+	        struct gfs2_ail *ai)
+{
+	struct gfs2_bufdata *bd = bh->b_private;
+
+	gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
+
+	if (!buffer_pinned(bh))
+		gfs2_assert_withdraw(sdp, 0);
+
+	mark_buffer_dirty(bh);
+	clear_buffer_pinned(bh);
+
+	gfs2_log_lock(sdp);
+	if (bd->bd_ail) {
+		list_del(&bd->bd_ail_st_list);
+		brelse(bh);
+	} else {
+		struct gfs2_glock *gl = bd->bd_gl;
+		list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
+		atomic_inc(&gl->gl_ail_count);
+	}
+	bd->bd_ail = ai;
+	list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+	gfs2_log_unlock(sdp);
+}
+
+/**
+ * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore
+ * @ip: the inode who owns the buffers
+ * @bstart: the first buffer in the run
+ * @blen: the number of buffers in the run
+ *
+ */
+
+void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct inode *aspace = ip->i_gl->gl_aspace;
+	struct buffer_head *bh;
+
+	while (blen) {
+		bh = getbuf(sdp, aspace, bstart, NO_CREATE);
+		if (bh) {
+			struct gfs2_bufdata *bd = bh->b_private;
+
+			if (test_clear_buffer_pinned(bh)) {
+				struct gfs2_trans *tr = current->journal_info;
+				gfs2_log_lock(sdp);
+				list_del_init(&bd->bd_le.le_list);
+				gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
+				sdp->sd_log_num_buf--;
+				gfs2_log_unlock(sdp);
+				tr->tr_num_buf_rm++;
+				brelse(bh);
+			}
+			if (bd) {
+				gfs2_log_lock(sdp);
+				if (bd->bd_ail) {
+					u64 blkno = bh->b_blocknr;
+					bd->bd_ail = NULL;
+					list_del(&bd->bd_ail_st_list);
+					list_del(&bd->bd_ail_gl_list);
+					atomic_dec(&bd->bd_gl->gl_ail_count);
+					brelse(bh);
+					gfs2_log_unlock(sdp);
+					gfs2_trans_add_revoke(sdp, blkno);
+				} else
+					gfs2_log_unlock(sdp);
+			}
+
+			lock_buffer(bh);
+			clear_buffer_dirty(bh);
+			clear_buffer_uptodate(bh);
+			unlock_buffer(bh);
+
+			brelse(bh);
+		}
+
+		bstart++;
+		blen--;
+	}
+}
+
+/**
+ * gfs2_meta_cache_flush - get rid of any references on buffers for this inode
+ * @ip: The GFS2 inode
+ *
+ * This releases buffers that are in the most-recently-used array of
+ * blocks used for indirect block addressing for this inode.
+ */
+
+void gfs2_meta_cache_flush(struct gfs2_inode *ip)
+{
+	struct buffer_head **bh_slot;
+	unsigned int x;
+
+	spin_lock(&ip->i_spin);
+
+	for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
+		bh_slot = &ip->i_cache[x];
+		if (!*bh_slot)
+			break;
+		brelse(*bh_slot);
+		*bh_slot = NULL;
+	}
+
+	spin_unlock(&ip->i_spin);
+}
+
+/**
+ * gfs2_meta_indirect_buffer - Get a metadata buffer
+ * @ip: The GFS2 inode
+ * @height: The level of this buf in the metadata (indir addr) tree (if any)
+ * @num: The block number (device relative) of the buffer
+ * @new: Non-zero if we may create a new buffer
+ * @bhp: the buffer is returned here
+ *
+ * Try to use the gfs2_inode's MRU metadata tree cache.
+ *
+ * Returns: errno
+ */
+
+int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+			      int new, struct buffer_head **bhp)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	struct buffer_head *bh = NULL, **bh_slot = ip->i_cache + height;
+	int in_cache = 0;
+
+	spin_lock(&ip->i_spin);
+	if (*bh_slot && (*bh_slot)->b_blocknr == num) {
+		bh = *bh_slot;
+		get_bh(bh);
+		in_cache = 1;
+	}
+	spin_unlock(&ip->i_spin);
+
+	if (!bh)
+		bh = getbuf(gl->gl_sbd, gl->gl_aspace, num, CREATE);
+
+	if (!bh)
+		return -ENOBUFS;
+
+	if (new) {
+		if (gfs2_assert_warn(sdp, height))
+			goto err;
+		meta_prep_new(bh);
+		gfs2_trans_add_bh(ip->i_gl, bh, 1);
+		gfs2_metatype_set(bh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+		gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
+	} else {
+		u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
+		if (!buffer_uptodate(bh)) {
+			ll_rw_block(READ_META, 1, &bh);
+			if (gfs2_meta_wait(sdp, bh))
+				goto err;
+		}
+		if (gfs2_metatype_check(sdp, bh, mtype))
+			goto err;
+	}
+
+	if (!in_cache) {
+		spin_lock(&ip->i_spin);
+		if (*bh_slot)
+			brelse(*bh_slot);
+		*bh_slot = bh;
+		get_bh(bh);
+		spin_unlock(&ip->i_spin);
+	}
+
+	*bhp = bh;
+	return 0;
+err:
+	brelse(bh);
+	return -EIO;
+}
+
+/**
+ * gfs2_meta_ra - start readahead on an extent of a file
+ * @gl: the glock the blocks belong to
+ * @dblock: the starting disk block
+ * @extlen: the number of blocks in the extent
+ *
+ * returns: the first buffer in the extent
+ */
+
+struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct inode *aspace = gl->gl_aspace;
+	struct buffer_head *first_bh, *bh;
+	u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >>
+			  sdp->sd_sb.sb_bsize_shift;
+
+	BUG_ON(!extlen);
+
+	if (max_ra < 1)
+		max_ra = 1;
+	if (extlen > max_ra)
+		extlen = max_ra;
+
+	first_bh = getbuf(sdp, aspace, dblock, CREATE);
+
+	if (buffer_uptodate(first_bh))
+		goto out;
+	if (!buffer_locked(first_bh))
+		ll_rw_block(READ_META, 1, &first_bh);
+
+	dblock++;
+	extlen--;
+
+	while (extlen) {
+		bh = getbuf(sdp, aspace, dblock, CREATE);
+
+		if (!buffer_uptodate(bh) && !buffer_locked(bh))
+			ll_rw_block(READA, 1, &bh);
+		brelse(bh);
+		dblock++;
+		extlen--;
+		if (!buffer_locked(first_bh) && buffer_uptodate(first_bh))
+			goto out;
+	}
+
+	wait_on_buffer(first_bh);
+out:
+	return first_bh;
+}
+
+/**
+ * gfs2_meta_syncfs - sync all the buffers in a filesystem
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
+{
+	gfs2_log_flush(sdp, NULL);
+	for (;;) {
+		gfs2_ail1_start(sdp, DIO_ALL);
+		if (gfs2_ail1_empty(sdp, DIO_ALL))
+			break;
+		msleep(10);
+	}
+}
+
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
new file mode 100644
index 000000000000..3ec939e20dff
--- /dev/null
+++ b/fs/gfs2/meta_io.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __DIO_DOT_H__
+#define __DIO_DOT_H__
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include "incore.h"
+
+static inline void gfs2_buffer_clear(struct buffer_head *bh)
+{
+	memset(bh->b_data, 0, bh->b_size);
+}
+
+static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head)
+{
+	BUG_ON(head > bh->b_size);
+	memset(bh->b_data + head, 0, bh->b_size - head);
+}
+
+static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
+					 int to_head,
+					 struct buffer_head *from_bh,
+					 int from_head)
+{
+	BUG_ON(from_head < to_head);
+	memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head,
+	       from_bh->b_size - from_head);
+	memset(to_bh->b_data + to_bh->b_size + to_head - from_head,
+	       0, from_head - to_head);
+}
+
+struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
+void gfs2_aspace_put(struct inode *aspace);
+
+void gfs2_meta_inval(struct gfs2_glock *gl);
+void gfs2_meta_sync(struct gfs2_glock *gl);
+
+struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
+int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
+		   int flags, struct buffer_head **bhp);
+int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+
+void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
+			 int meta);
+void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
+void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		struct gfs2_ail *ai);
+
+void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
+
+void gfs2_meta_cache_flush(struct gfs2_inode *ip);
+int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
+			      int new, struct buffer_head **bhp);
+
+static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip,
+					 struct buffer_head **bhp)
+{
+	return gfs2_meta_indirect_buffer(ip, 0, ip->i_num.no_addr, 0, bhp);
+}
+
+struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
+void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
+
+#define buffer_busy(bh) \
+((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
+#define buffer_in_io(bh) \
+((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
+
+#endif /* __DIO_DOT_H__ */
+
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
new file mode 100644
index 000000000000..ef3092e29607
--- /dev/null
+++ b/fs/gfs2/mount.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "mount.h"
+#include "sys.h"
+#include "util.h"
+
+/**
+ * gfs2_mount_args - Parse mount options
+ * @sdp:
+ * @data:
+ *
+ * Return: errno
+ */
+
+int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
+{
+	struct gfs2_args *args = &sdp->sd_args;
+	char *data = data_arg;
+	char *options, *o, *v;
+	int error = 0;
+
+	if (!remount) {
+		/*  If someone preloaded options, use those instead  */
+		spin_lock(&gfs2_sys_margs_lock);
+		if (gfs2_sys_margs) {
+			data = gfs2_sys_margs;
+			gfs2_sys_margs = NULL;
+		}
+		spin_unlock(&gfs2_sys_margs_lock);
+
+		/*  Set some defaults  */
+		args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
+		args->ar_quota = GFS2_QUOTA_DEFAULT;
+		args->ar_data = GFS2_DATA_DEFAULT;
+	}
+
+	/* Split the options into tokens with the "," character and
+	   process them */
+
+	for (options = data; (o = strsep(&options, ",")); ) {
+		if (!*o)
+			continue;
+
+		v = strchr(o, '=');
+		if (v)
+			*v++ = 0;
+
+		if (!strcmp(o, "lockproto")) {
+			if (!v)
+				goto need_value;
+			if (remount && strcmp(v, args->ar_lockproto))
+				goto cant_remount;
+			strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
+			args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
+		}
+
+		else if (!strcmp(o, "locktable")) {
+			if (!v)
+				goto need_value;
+			if (remount && strcmp(v, args->ar_locktable))
+				goto cant_remount;
+			strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
+			args->ar_locktable[GFS2_LOCKNAME_LEN - 1] = 0;
+		}
+
+		else if (!strcmp(o, "hostdata")) {
+			if (!v)
+				goto need_value;
+			if (remount && strcmp(v, args->ar_hostdata))
+				goto cant_remount;
+			strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
+			args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
+		}
+
+		else if (!strcmp(o, "spectator")) {
+			if (remount && !args->ar_spectator)
+				goto cant_remount;
+			args->ar_spectator = 1;
+			sdp->sd_vfs->s_flags |= MS_RDONLY;
+		}
+
+		else if (!strcmp(o, "ignore_local_fs")) {
+			if (remount && !args->ar_ignore_local_fs)
+				goto cant_remount;
+			args->ar_ignore_local_fs = 1;
+		}
+
+		else if (!strcmp(o, "localflocks")) {
+			if (remount && !args->ar_localflocks)
+				goto cant_remount;
+			args->ar_localflocks = 1;
+		}
+
+		else if (!strcmp(o, "localcaching")) {
+			if (remount && !args->ar_localcaching)
+				goto cant_remount;
+			args->ar_localcaching = 1;
+		}
+
+		else if (!strcmp(o, "debug"))
+			args->ar_debug = 1;
+
+		else if (!strcmp(o, "nodebug"))
+			args->ar_debug = 0;
+
+		else if (!strcmp(o, "upgrade")) {
+			if (remount && !args->ar_upgrade)
+				goto cant_remount;
+			args->ar_upgrade = 1;
+		}
+
+		else if (!strcmp(o, "num_glockd")) {
+			unsigned int x;
+			if (!v)
+				goto need_value;
+			sscanf(v, "%u", &x);
+			if (remount && x != args->ar_num_glockd)
+				goto cant_remount;
+			if (!x || x > GFS2_GLOCKD_MAX) {
+				fs_info(sdp, "0 < num_glockd <= %u  (not %u)\n",
+				        GFS2_GLOCKD_MAX, x);
+				error = -EINVAL;
+				break;
+			}
+			args->ar_num_glockd = x;
+		}
+
+		else if (!strcmp(o, "acl")) {
+			args->ar_posix_acl = 1;
+			sdp->sd_vfs->s_flags |= MS_POSIXACL;
+		}
+
+		else if (!strcmp(o, "noacl")) {
+			args->ar_posix_acl = 0;
+			sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
+		}
+
+		else if (!strcmp(o, "quota")) {
+			if (!v)
+				goto need_value;
+			if (!strcmp(v, "off"))
+				args->ar_quota = GFS2_QUOTA_OFF;
+			else if (!strcmp(v, "account"))
+				args->ar_quota = GFS2_QUOTA_ACCOUNT;
+			else if (!strcmp(v, "on"))
+				args->ar_quota = GFS2_QUOTA_ON;
+			else {
+				fs_info(sdp, "invalid value for quota\n");
+				error = -EINVAL;
+				break;
+			}
+		}
+
+		else if (!strcmp(o, "suiddir"))
+			args->ar_suiddir = 1;
+
+		else if (!strcmp(o, "nosuiddir"))
+			args->ar_suiddir = 0;
+
+		else if (!strcmp(o, "data")) {
+			if (!v)
+				goto need_value;
+			if (!strcmp(v, "writeback"))
+				args->ar_data = GFS2_DATA_WRITEBACK;
+			else if (!strcmp(v, "ordered"))
+				args->ar_data = GFS2_DATA_ORDERED;
+			else {
+				fs_info(sdp, "invalid value for data\n");
+				error = -EINVAL;
+				break;
+			}
+		}
+
+		else {
+			fs_info(sdp, "unknown option: %s\n", o);
+			error = -EINVAL;
+			break;
+		}
+	}
+
+	if (error)
+		fs_info(sdp, "invalid mount option(s)\n");
+
+	if (data != data_arg)
+		kfree(data);
+
+	return error;
+
+need_value:
+	fs_info(sdp, "need value for option %s\n", o);
+	return -EINVAL;
+
+cant_remount:
+	fs_info(sdp, "can't remount with option %s\n", o);
+	return -EINVAL;
+}
+
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
new file mode 100644
index 000000000000..401288acfdf3
--- /dev/null
+++ b/fs/gfs2/mount.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __MOUNT_DOT_H__
+#define __MOUNT_DOT_H__
+
+struct gfs2_sbd;
+
+int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
+
+#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ondisk.c b/fs/gfs2/ondisk.c
new file mode 100644
index 000000000000..1025960b0e6e
--- /dev/null
+++ b/fs/gfs2/ondisk.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+
+#include "gfs2.h"
+#include <linux/gfs2_ondisk.h>
+
+#define pv(struct, member, fmt) printk(KERN_INFO "  "#member" = "fmt"\n", \
+				       struct->member);
+
+/*
+ * gfs2_xxx_in - read in an xxx struct
+ * first arg: the cpu-order structure
+ * buf: the disk-order buffer
+ *
+ * gfs2_xxx_out - write out an xxx struct
+ * first arg: the cpu-order structure
+ * buf: the disk-order buffer
+ *
+ * gfs2_xxx_print - print out an xxx struct
+ * first arg: the cpu-order structure
+ */
+
+void gfs2_inum_in(struct gfs2_inum *no, const void *buf)
+{
+	const struct gfs2_inum *str = buf;
+
+	no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
+	no->no_addr = be64_to_cpu(str->no_addr);
+}
+
+void gfs2_inum_out(const struct gfs2_inum *no, void *buf)
+{
+	struct gfs2_inum *str = buf;
+
+	str->no_formal_ino = cpu_to_be64(no->no_formal_ino);
+	str->no_addr = cpu_to_be64(no->no_addr);
+}
+
+static void gfs2_inum_print(const struct gfs2_inum *no)
+{
+	printk(KERN_INFO "  no_formal_ino = %llu\n", (unsigned long long)no->no_formal_ino);
+	printk(KERN_INFO "  no_addr = %llu\n", (unsigned long long)no->no_addr);
+}
+
+static void gfs2_meta_header_in(struct gfs2_meta_header *mh, const void *buf)
+{
+	const struct gfs2_meta_header *str = buf;
+
+	mh->mh_magic = be32_to_cpu(str->mh_magic);
+	mh->mh_type = be32_to_cpu(str->mh_type);
+	mh->mh_format = be32_to_cpu(str->mh_format);
+}
+
+static void gfs2_meta_header_out(const struct gfs2_meta_header *mh, void *buf)
+{
+	struct gfs2_meta_header *str = buf;
+
+	str->mh_magic = cpu_to_be32(mh->mh_magic);
+	str->mh_type = cpu_to_be32(mh->mh_type);
+	str->mh_format = cpu_to_be32(mh->mh_format);
+}
+
+static void gfs2_meta_header_print(const struct gfs2_meta_header *mh)
+{
+	pv(mh, mh_magic, "0x%.8X");
+	pv(mh, mh_type, "%u");
+	pv(mh, mh_format, "%u");
+}
+
+void gfs2_sb_in(struct gfs2_sb *sb, const void *buf)
+{
+	const struct gfs2_sb *str = buf;
+
+	gfs2_meta_header_in(&sb->sb_header, buf);
+
+	sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
+	sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
+	sb->sb_bsize = be32_to_cpu(str->sb_bsize);
+	sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
+
+	gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
+	gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);
+
+	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
+	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
+}
+
+void gfs2_rindex_in(struct gfs2_rindex *ri, const void *buf)
+{
+	const struct gfs2_rindex *str = buf;
+
+	ri->ri_addr = be64_to_cpu(str->ri_addr);
+	ri->ri_length = be32_to_cpu(str->ri_length);
+	ri->ri_data0 = be64_to_cpu(str->ri_data0);
+	ri->ri_data = be32_to_cpu(str->ri_data);
+	ri->ri_bitbytes = be32_to_cpu(str->ri_bitbytes);
+
+}
+
+void gfs2_rindex_print(const struct gfs2_rindex *ri)
+{
+	printk(KERN_INFO "  ri_addr = %llu\n", (unsigned long long)ri->ri_addr);
+	pv(ri, ri_length, "%u");
+
+	printk(KERN_INFO "  ri_data0 = %llu\n", (unsigned long long)ri->ri_data0);
+	pv(ri, ri_data, "%u");
+
+	pv(ri, ri_bitbytes, "%u");
+}
+
+void gfs2_rgrp_in(struct gfs2_rgrp *rg, const void *buf)
+{
+	const struct gfs2_rgrp *str = buf;
+
+	gfs2_meta_header_in(&rg->rg_header, buf);
+	rg->rg_flags = be32_to_cpu(str->rg_flags);
+	rg->rg_free = be32_to_cpu(str->rg_free);
+	rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
+	rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
+}
+
+void gfs2_rgrp_out(const struct gfs2_rgrp *rg, void *buf)
+{
+	struct gfs2_rgrp *str = buf;
+
+	gfs2_meta_header_out(&rg->rg_header, buf);
+	str->rg_flags = cpu_to_be32(rg->rg_flags);
+	str->rg_free = cpu_to_be32(rg->rg_free);
+	str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
+	str->__pad = cpu_to_be32(0);
+	str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
+	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
+}
+
+void gfs2_quota_in(struct gfs2_quota *qu, const void *buf)
+{
+	const struct gfs2_quota *str = buf;
+
+	qu->qu_limit = be64_to_cpu(str->qu_limit);
+	qu->qu_warn = be64_to_cpu(str->qu_warn);
+	qu->qu_value = be64_to_cpu(str->qu_value);
+}
+
+void gfs2_dinode_in(struct gfs2_dinode *di, const void *buf)
+{
+	const struct gfs2_dinode *str = buf;
+
+	gfs2_meta_header_in(&di->di_header, buf);
+	gfs2_inum_in(&di->di_num, &str->di_num);
+
+	di->di_mode = be32_to_cpu(str->di_mode);
+	di->di_uid = be32_to_cpu(str->di_uid);
+	di->di_gid = be32_to_cpu(str->di_gid);
+	di->di_nlink = be32_to_cpu(str->di_nlink);
+	di->di_size = be64_to_cpu(str->di_size);
+	di->di_blocks = be64_to_cpu(str->di_blocks);
+	di->di_atime = be64_to_cpu(str->di_atime);
+	di->di_mtime = be64_to_cpu(str->di_mtime);
+	di->di_ctime = be64_to_cpu(str->di_ctime);
+	di->di_major = be32_to_cpu(str->di_major);
+	di->di_minor = be32_to_cpu(str->di_minor);
+
+	di->di_goal_meta = be64_to_cpu(str->di_goal_meta);
+	di->di_goal_data = be64_to_cpu(str->di_goal_data);
+	di->di_generation = be64_to_cpu(str->di_generation);
+
+	di->di_flags = be32_to_cpu(str->di_flags);
+	di->di_payload_format = be32_to_cpu(str->di_payload_format);
+	di->di_height = be16_to_cpu(str->di_height);
+
+	di->di_depth = be16_to_cpu(str->di_depth);
+	di->di_entries = be32_to_cpu(str->di_entries);
+
+	di->di_eattr = be64_to_cpu(str->di_eattr);
+
+}
+
+void gfs2_dinode_out(const struct gfs2_dinode *di, void *buf)
+{
+	struct gfs2_dinode *str = buf;
+
+	gfs2_meta_header_out(&di->di_header, buf);
+	gfs2_inum_out(&di->di_num, (char *)&str->di_num);
+
+	str->di_mode = cpu_to_be32(di->di_mode);
+	str->di_uid = cpu_to_be32(di->di_uid);
+	str->di_gid = cpu_to_be32(di->di_gid);
+	str->di_nlink = cpu_to_be32(di->di_nlink);
+	str->di_size = cpu_to_be64(di->di_size);
+	str->di_blocks = cpu_to_be64(di->di_blocks);
+	str->di_atime = cpu_to_be64(di->di_atime);
+	str->di_mtime = cpu_to_be64(di->di_mtime);
+	str->di_ctime = cpu_to_be64(di->di_ctime);
+	str->di_major = cpu_to_be32(di->di_major);
+	str->di_minor = cpu_to_be32(di->di_minor);
+
+	str->di_goal_meta = cpu_to_be64(di->di_goal_meta);
+	str->di_goal_data = cpu_to_be64(di->di_goal_data);
+	str->di_generation = cpu_to_be64(di->di_generation);
+
+	str->di_flags = cpu_to_be32(di->di_flags);
+	str->di_payload_format = cpu_to_be32(di->di_payload_format);
+	str->di_height = cpu_to_be16(di->di_height);
+
+	str->di_depth = cpu_to_be16(di->di_depth);
+	str->di_entries = cpu_to_be32(di->di_entries);
+
+	str->di_eattr = cpu_to_be64(di->di_eattr);
+
+}
+
+void gfs2_dinode_print(const struct gfs2_dinode *di)
+{
+	gfs2_meta_header_print(&di->di_header);
+	gfs2_inum_print(&di->di_num);
+
+	pv(di, di_mode, "0%o");
+	pv(di, di_uid, "%u");
+	pv(di, di_gid, "%u");
+	pv(di, di_nlink, "%u");
+	printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
+	printk(KERN_INFO "  di_blocks = %llu\n", (unsigned long long)di->di_blocks);
+	printk(KERN_INFO "  di_atime = %lld\n", (long long)di->di_atime);
+	printk(KERN_INFO "  di_mtime = %lld\n", (long long)di->di_mtime);
+	printk(KERN_INFO "  di_ctime = %lld\n", (long long)di->di_ctime);
+	pv(di, di_major, "%u");
+	pv(di, di_minor, "%u");
+
+	printk(KERN_INFO "  di_goal_meta = %llu\n", (unsigned long long)di->di_goal_meta);
+	printk(KERN_INFO "  di_goal_data = %llu\n", (unsigned long long)di->di_goal_data);
+
+	pv(di, di_flags, "0x%.8X");
+	pv(di, di_payload_format, "%u");
+	pv(di, di_height, "%u");
+
+	pv(di, di_depth, "%u");
+	pv(di, di_entries, "%u");
+
+	printk(KERN_INFO "  di_eattr = %llu\n", (unsigned long long)di->di_eattr);
+}
+
+void gfs2_log_header_in(struct gfs2_log_header *lh, const void *buf)
+{
+	const struct gfs2_log_header *str = buf;
+
+	gfs2_meta_header_in(&lh->lh_header, buf);
+	lh->lh_sequence = be64_to_cpu(str->lh_sequence);
+	lh->lh_flags = be32_to_cpu(str->lh_flags);
+	lh->lh_tail = be32_to_cpu(str->lh_tail);
+	lh->lh_blkno = be32_to_cpu(str->lh_blkno);
+	lh->lh_hash = be32_to_cpu(str->lh_hash);
+}
+
+void gfs2_inum_range_in(struct gfs2_inum_range *ir, const void *buf)
+{
+	const struct gfs2_inum_range *str = buf;
+
+	ir->ir_start = be64_to_cpu(str->ir_start);
+	ir->ir_length = be64_to_cpu(str->ir_length);
+}
+
+void gfs2_inum_range_out(const struct gfs2_inum_range *ir, void *buf)
+{
+	struct gfs2_inum_range *str = buf;
+
+	str->ir_start = cpu_to_be64(ir->ir_start);
+	str->ir_length = cpu_to_be64(ir->ir_length);
+}
+
+void gfs2_statfs_change_in(struct gfs2_statfs_change *sc, const void *buf)
+{
+	const struct gfs2_statfs_change *str = buf;
+
+	sc->sc_total = be64_to_cpu(str->sc_total);
+	sc->sc_free = be64_to_cpu(str->sc_free);
+	sc->sc_dinodes = be64_to_cpu(str->sc_dinodes);
+}
+
+void gfs2_statfs_change_out(const struct gfs2_statfs_change *sc, void *buf)
+{
+	struct gfs2_statfs_change *str = buf;
+
+	str->sc_total = cpu_to_be64(sc->sc_total);
+	str->sc_free = cpu_to_be64(sc->sc_free);
+	str->sc_dinodes = cpu_to_be64(sc->sc_dinodes);
+}
+
+void gfs2_quota_change_in(struct gfs2_quota_change *qc, const void *buf)
+{
+	const struct gfs2_quota_change *str = buf;
+
+	qc->qc_change = be64_to_cpu(str->qc_change);
+	qc->qc_flags = be32_to_cpu(str->qc_flags);
+	qc->qc_id = be32_to_cpu(str->qc_id);
+}
+
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
new file mode 100644
index 000000000000..8d5963c7e123
--- /dev/null
+++ b/fs/gfs2/ops_address.c
@@ -0,0 +1,793 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/mpage.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "ops_address.h"
+#include "quota.h"
+#include "trans.h"
+#include "rgrp.h"
+#include "ops_file.h"
+#include "util.h"
+#include "glops.h"
+
+
+static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+				   unsigned int from, unsigned int to)
+{
+	struct buffer_head *head = page_buffers(page);
+	unsigned int bsize = head->b_size;
+	struct buffer_head *bh;
+	unsigned int start, end;
+
+	for (bh = head, start = 0; bh != head || !start;
+	     bh = bh->b_this_page, start = end) {
+		end = start + bsize;
+		if (end <= from || start >= to)
+			continue;
+		gfs2_trans_add_bh(ip->i_gl, bh, 0);
+	}
+}
+
+/**
+ * gfs2_get_block - Fills in a buffer head with details about a block
+ * @inode: The inode
+ * @lblock: The block number to look up
+ * @bh_result: The buffer head to return the result in
+ * @create: Non-zero if we may add block to the file
+ *
+ * Returns: errno
+ */
+
+int gfs2_get_block(struct inode *inode, sector_t lblock,
+	           struct buffer_head *bh_result, int create)
+{
+	return gfs2_block_map(inode, lblock, create, bh_result);
+}
+
+/**
+ * gfs2_get_block_noalloc - Fills in a buffer head with details about a block
+ * @inode: The inode
+ * @lblock: The block number to look up
+ * @bh_result: The buffer head to return the result in
+ * @create: Non-zero if we may add block to the file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
+				  struct buffer_head *bh_result, int create)
+{
+	int error;
+
+	error = gfs2_block_map(inode, lblock, 0, bh_result);
+	if (error)
+		return error;
+	if (bh_result->b_blocknr == 0)
+		return -EIO;
+	return 0;
+}
+
+static int gfs2_get_block_direct(struct inode *inode, sector_t lblock,
+				 struct buffer_head *bh_result, int create)
+{
+	return gfs2_block_map(inode, lblock, 0, bh_result);
+}
+
+/**
+ * gfs2_writepage - Write complete page
+ * @page: Page to write
+ *
+ * Returns: errno
+ *
+ * Some of this is copied from block_write_full_page() although we still
+ * call it to do most of the work.
+ */
+
+static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	int error;
+	int done_trans = 0;
+
+	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) {
+		unlock_page(page);
+		return -EIO;
+	}
+	if (current->journal_info)
+		goto out_ignore;
+
+	/* Is the page fully outside i_size? (truncate in progress) */
+        offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (page->index > end_index || (page->index == end_index && !offset)) {
+		page->mapping->a_ops->invalidatepage(page, 0);
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
+		error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+		if (error)
+			goto out_ignore;
+		if (!page_has_buffers(page)) {
+			create_empty_buffers(page, inode->i_sb->s_blocksize,
+					     (1 << BH_Dirty)|(1 << BH_Uptodate));
+		}
+		gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1);
+		done_trans = 1;
+	}
+	error = block_write_full_page(page, gfs2_get_block_noalloc, wbc);
+	if (done_trans)
+		gfs2_trans_end(sdp);
+	gfs2_meta_cache_flush(ip);
+	return error;
+
+out_ignore:
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+	return 0;
+}
+
+static int zero_readpage(struct page *page)
+{
+	void *kaddr;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr, 0, PAGE_CACHE_SIZE);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	SetPageUptodate(page);
+
+	return 0;
+}
+
+/**
+ * stuffed_readpage - Fill in a Linux page with stuffed file data
+ * @ip: the inode
+ * @page: the page
+ *
+ * Returns: errno
+ */
+
+static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+{
+	struct buffer_head *dibh;
+	void *kaddr;
+	int error;
+
+	/* Only the first page of a stuffed file might contain data */
+	if (unlikely(page->index))
+		return zero_readpage(page);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		return error;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
+	       ip->i_di.di_size);
+	memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	brelse(dibh);
+
+	SetPageUptodate(page);
+
+	return 0;
+}
+
+
+/**
+ * gfs2_readpage - readpage with locking
+ * @file: The file to read a page for. N.B. This may be NULL if we are
+ * reading an internal file.
+ * @page: The page to read
+ *
+ * Returns: errno
+ */
+
+static int gfs2_readpage(struct file *file, struct page *page)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	struct gfs2_file *gf = NULL;
+	struct gfs2_holder gh;
+	int error;
+	int do_unlock = 0;
+
+	if (likely(file != &gfs2_internal_file_sentinel)) {
+		if (file) {
+			gf = file->private_data;
+			if (test_bit(GFF_EXLOCK, &gf->f_flags))
+				/* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
+				goto skip_lock;
+		}
+		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
+		do_unlock = 1;
+		error = gfs2_glock_nq_m_atime(1, &gh);
+		if (unlikely(error))
+			goto out_unlock;
+	}
+
+skip_lock:
+	if (gfs2_is_stuffed(ip)) {
+		error = stuffed_readpage(ip, page);
+		unlock_page(page);
+	} else
+		error = mpage_readpage(page, gfs2_get_block);
+
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		error = -EIO;
+
+	if (do_unlock) {
+		gfs2_glock_dq_m(1, &gh);
+		gfs2_holder_uninit(&gh);
+	}
+out:
+	return error;
+out_unlock:
+	unlock_page(page);
+	if (do_unlock)
+		gfs2_holder_uninit(&gh);
+	goto out;
+}
+
+/**
+ * gfs2_readpages - Read a bunch of pages at once
+ *
+ * Some notes:
+ * 1. This is only for readahead, so we can simply ignore any things
+ *    which are slightly inconvenient (such as locking conflicts between
+ *    the page lock and the glock) and return having done no I/O. Its
+ *    obviously not something we'd want to do on too regular a basis.
+ *    Any I/O we ignore at this time will be done via readpage later.
+ * 2. We have to handle stuffed files here too.
+ * 3. mpage_readpages() does most of the heavy lifting in the common case.
+ * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
+ * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
+ *    well as read-ahead.
+ */
+static int gfs2_readpages(struct file *file, struct address_space *mapping,
+			  struct list_head *pages, unsigned nr_pages)
+{
+	struct inode *inode = mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_holder gh;
+	unsigned page_idx;
+	int ret;
+	int do_unlock = 0;
+
+	if (likely(file != &gfs2_internal_file_sentinel)) {
+		if (file) {
+			struct gfs2_file *gf = file->private_data;
+			if (test_bit(GFF_EXLOCK, &gf->f_flags))
+				goto skip_lock;
+		}
+		gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
+				 LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
+		do_unlock = 1;
+		ret = gfs2_glock_nq_m_atime(1, &gh);
+		if (ret == GLR_TRYFAILED)
+			goto out_noerror;
+		if (unlikely(ret))
+			goto out_unlock;
+	}
+skip_lock:
+	if (gfs2_is_stuffed(ip)) {
+		struct pagevec lru_pvec;
+		pagevec_init(&lru_pvec, 0);
+		for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+			struct page *page = list_entry(pages->prev, struct page, lru);
+			prefetchw(&page->flags);
+			list_del(&page->lru);
+			if (!add_to_page_cache(page, mapping,
+					       page->index, GFP_KERNEL)) {
+				ret = stuffed_readpage(ip, page);
+				unlock_page(page);
+				if (!pagevec_add(&lru_pvec, page))
+					 __pagevec_lru_add(&lru_pvec);
+			} else {
+				page_cache_release(page);
+			}
+		}
+		pagevec_lru_add(&lru_pvec);
+		ret = 0;
+	} else {
+		/* What we really want to do .... */
+		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
+	}
+
+	if (do_unlock) {
+		gfs2_glock_dq_m(1, &gh);
+		gfs2_holder_uninit(&gh);
+	}
+out:
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		ret = -EIO;
+	return ret;
+out_noerror:
+	ret = 0;
+out_unlock:
+	/* unlock all pages, we can't do any I/O right now */
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page = list_entry(pages->prev, struct page, lru);
+		list_del(&page->lru);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	if (do_unlock)
+		gfs2_holder_uninit(&gh);
+	goto out;
+}
+
+/**
+ * gfs2_prepare_write - Prepare to write a page to a file
+ * @file: The file to write to
+ * @page: The page which is to be prepared for writing
+ * @from: From (byte range within page)
+ * @to: To (byte range within page)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_prepare_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	unsigned int data_blocks, ind_blocks, rblocks;
+	int alloc_required;
+	int error = 0;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
+	loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	struct gfs2_alloc *al;
+	unsigned int write_len = to - from;
+
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
+	error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
+	if (error)
+		goto out_uninit;
+
+	gfs2_write_calc_reserv(ip, write_len, &data_blocks, &ind_blocks);
+
+	error = gfs2_write_alloc_required(ip, pos, write_len, &alloc_required);
+	if (error)
+		goto out_unlock;
+
+
+	ip->i_alloc.al_requested = 0;
+	if (alloc_required) {
+		al = gfs2_alloc_get(ip);
+
+		error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+		if (error)
+			goto out_alloc_put;
+
+		error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+		if (error)
+			goto out_qunlock;
+
+		al->al_requested = data_blocks + ind_blocks;
+		error = gfs2_inplace_reserve(ip);
+		if (error)
+			goto out_qunlock;
+	}
+
+	rblocks = RES_DINODE + ind_blocks;
+	if (gfs2_is_jdata(ip))
+		rblocks += data_blocks ? data_blocks : 1;
+	if (ind_blocks || data_blocks)
+		rblocks += RES_STATFS + RES_QUOTA;
+
+	error = gfs2_trans_begin(sdp, rblocks, 0);
+	if (error)
+		goto out;
+
+	if (gfs2_is_stuffed(ip)) {
+		if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+			error = gfs2_unstuff_dinode(ip, page);
+			if (error == 0)
+				goto prepare_write;
+		} else if (!PageUptodate(page))
+			error = stuffed_readpage(ip, page);
+		goto out;
+	}
+
+prepare_write:
+	error = block_prepare_write(page, from, to, gfs2_get_block);
+
+out:
+	if (error) {
+		gfs2_trans_end(sdp);
+		if (alloc_required) {
+			gfs2_inplace_release(ip);
+out_qunlock:
+			gfs2_quota_unlock(ip);
+out_alloc_put:
+			gfs2_alloc_put(ip);
+		}
+out_unlock:
+		gfs2_glock_dq_m(1, &ip->i_gh);
+out_uninit:
+		gfs2_holder_uninit(&ip->i_gh);
+	}
+
+	return error;
+}
+
+/**
+ * gfs2_commit_write - Commit write to a file
+ * @file: The file to write to
+ * @page: The page containing the data
+ * @from: From (byte range within page)
+ * @to: To (byte range within page)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_commit_write(struct file *file, struct page *page,
+			     unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	int error = -EOPNOTSUPP;
+	struct buffer_head *dibh;
+	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_dinode *di;
+
+	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
+                goto fail_nounlock;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto fail_endtrans;
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	di = (struct gfs2_dinode *)dibh->b_data;
+
+	if (gfs2_is_stuffed(ip)) {
+		u64 file_size;
+		void *kaddr;
+
+		file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to;
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
+		       kaddr + from, to - from);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		SetPageUptodate(page);
+
+		if (inode->i_size < file_size)
+			i_size_write(inode, file_size);
+	} else {
+		if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
+		    gfs2_is_jdata(ip))
+			gfs2_page_add_databufs(ip, page, from, to);
+		error = generic_commit_write(file, page, from, to);
+		if (error)
+			goto fail;
+	}
+
+	if (ip->i_di.di_size < inode->i_size) {
+		ip->i_di.di_size = inode->i_size;
+		di->di_size = cpu_to_be64(inode->i_size);
+	}
+
+	di->di_mode = cpu_to_be32(inode->i_mode);
+	di->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
+	di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
+	di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
+
+	brelse(dibh);
+	gfs2_trans_end(sdp);
+	if (al->al_requested) {
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+		gfs2_alloc_put(ip);
+	}
+	gfs2_glock_dq_m(1, &ip->i_gh);
+	gfs2_holder_uninit(&ip->i_gh);
+	return 0;
+
+fail:
+	brelse(dibh);
+fail_endtrans:
+	gfs2_trans_end(sdp);
+	if (al->al_requested) {
+		gfs2_inplace_release(ip);
+		gfs2_quota_unlock(ip);
+		gfs2_alloc_put(ip);
+	}
+	gfs2_glock_dq_m(1, &ip->i_gh);
+	gfs2_holder_uninit(&ip->i_gh);
+fail_nounlock:
+	ClearPageUptodate(page);
+	return error;
+}
+
+/**
+ * gfs2_bmap - Block map function
+ * @mapping: Address space info
+ * @lblock: The block to map
+ *
+ * Returns: The disk address for the block or 0 on hole or error
+ */
+
+static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
+{
+	struct gfs2_inode *ip = GFS2_I(mapping->host);
+	struct gfs2_holder i_gh;
+	sector_t dblock = 0;
+	int error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return 0;
+
+	if (!gfs2_is_stuffed(ip))
+		dblock = generic_block_bmap(mapping, lblock, gfs2_get_block);
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return dblock;
+}
+
+static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	struct gfs2_bufdata *bd;
+
+	gfs2_log_lock(sdp);
+	bd = bh->b_private;
+	if (bd) {
+		bd->bd_bh = NULL;
+		bh->b_private = NULL;
+	}
+	gfs2_log_unlock(sdp);
+
+	lock_buffer(bh);
+	clear_buffer_dirty(bh);
+	bh->b_bdev = NULL;
+	clear_buffer_mapped(bh);
+	clear_buffer_req(bh);
+	clear_buffer_new(bh);
+	clear_buffer_delay(bh);
+	unlock_buffer(bh);
+}
+
+static void gfs2_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	struct buffer_head *head, *bh, *next;
+	unsigned int curr_off = 0;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+		next = bh->b_this_page;
+
+		if (offset <= curr_off)
+			discard_buffer(sdp, bh);
+
+		curr_off = next_off;
+		bh = next;
+	} while (bh != head);
+
+	if (!offset)
+		try_to_release_page(page, 0);
+
+	return;
+}
+
+static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int rv;
+
+	if (rw == READ)
+		mutex_lock(&inode->i_mutex);
+	/*
+	 * Shared lock, even if its a write, since we do no allocation
+	 * on this path. All we need change is atime.
+	 */
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+	rv = gfs2_glock_nq_m_atime(1, &gh);
+	if (rv)
+		goto out;
+
+	if (offset > i_size_read(inode))
+		goto out;
+
+	/*
+	 * Should we return an error here? I can't see that O_DIRECT for
+	 * a journaled file makes any sense. For now we'll silently fall
+	 * back to buffered I/O, likewise we do the same for stuffed
+	 * files since they are (a) small and (b) unaligned.
+	 */
+	if (gfs2_is_jdata(ip))
+		goto out;
+
+	if (gfs2_is_stuffed(ip))
+		goto out;
+
+	rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
+					    inode->i_sb->s_bdev,
+					    iov, offset, nr_segs,
+					    gfs2_get_block_direct, NULL);
+out:
+	gfs2_glock_dq_m(1, &gh);
+	gfs2_holder_uninit(&gh);
+	if (rw == READ)
+		mutex_unlock(&inode->i_mutex);
+
+	return rv;
+}
+
+/**
+ * stuck_releasepage - We're stuck in gfs2_releasepage().  Print stuff out.
+ * @bh: the buffer we're stuck on
+ *
+ */
+
+static void stuck_releasepage(struct buffer_head *bh)
+{
+	struct inode *inode = bh->b_page->mapping->host;
+	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_bufdata *bd = bh->b_private;
+	struct gfs2_glock *gl;
+static unsigned limit = 0;
+
+	if (limit > 3)
+		return;
+	limit++;
+
+	fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
+	fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
+		(unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
+	fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
+	fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
+
+	if (!bd)
+		return;
+
+	gl = bd->bd_gl;
+
+	fs_warn(sdp, "gl = (%u, %llu)\n",
+		gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
+
+	fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
+		(list_empty(&bd->bd_list_tr)) ? "no" : "yes",
+		(list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
+
+	if (gl->gl_ops == &gfs2_inode_glops) {
+		struct gfs2_inode *ip = gl->gl_object;
+		unsigned int x;
+
+		if (!ip)
+			return;
+
+		fs_warn(sdp, "ip = %llu %llu\n",
+			(unsigned long long)ip->i_num.no_formal_ino,
+			(unsigned long long)ip->i_num.no_addr);
+
+		for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
+			fs_warn(sdp, "ip->i_cache[%u] = %s\n",
+				x, (ip->i_cache[x]) ? "!NULL" : "NULL");
+	}
+}
+
+/**
+ * gfs2_releasepage - free the metadata associated with a page
+ * @page: the page that's being released
+ * @gfp_mask: passed from Linux VFS, ignored by us
+ *
+ * Call try_to_free_buffers() if the buffers in this page can be
+ * released.
+ *
+ * Returns: 0
+ */
+
+int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
+{
+	struct inode *aspace = page->mapping->host;
+	struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
+	struct buffer_head *bh, *head;
+	struct gfs2_bufdata *bd;
+	unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
+
+	if (!page_has_buffers(page))
+		goto out;
+
+	head = bh = page_buffers(page);
+	do {
+		while (atomic_read(&bh->b_count)) {
+			if (!atomic_read(&aspace->i_writecount))
+				return 0;
+
+			if (time_after_eq(jiffies, t)) {
+				stuck_releasepage(bh);
+				/* should we withdraw here? */
+				return 0;
+			}
+
+			yield();
+		}
+
+		gfs2_assert_warn(sdp, !buffer_pinned(bh));
+		gfs2_assert_warn(sdp, !buffer_dirty(bh));
+
+		gfs2_log_lock(sdp);
+		bd = bh->b_private;
+		if (bd) {
+			gfs2_assert_warn(sdp, bd->bd_bh == bh);
+			gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
+			gfs2_assert_warn(sdp, !bd->bd_ail);
+			bd->bd_bh = NULL;
+			if (!list_empty(&bd->bd_le.le_list))
+				bd = NULL;
+			bh->b_private = NULL;
+		}
+		gfs2_log_unlock(sdp);
+		if (bd)
+			kmem_cache_free(gfs2_bufdata_cachep, bd);
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+out:
+	return try_to_free_buffers(page);
+}
+
+const struct address_space_operations gfs2_file_aops = {
+	.writepage = gfs2_writepage,
+	.readpage = gfs2_readpage,
+	.readpages = gfs2_readpages,
+	.sync_page = block_sync_page,
+	.prepare_write = gfs2_prepare_write,
+	.commit_write = gfs2_commit_write,
+	.bmap = gfs2_bmap,
+	.invalidatepage = gfs2_invalidatepage,
+	.releasepage = gfs2_releasepage,
+	.direct_IO = gfs2_direct_IO,
+};
+
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
new file mode 100644
index 000000000000..35aaee4aa7e1
--- /dev/null
+++ b/fs/gfs2/ops_address.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_ADDRESS_DOT_H__
+#define __OPS_ADDRESS_DOT_H__
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+
+extern const struct address_space_operations gfs2_file_aops;
+extern int gfs2_get_block(struct inode *inode, sector_t lblock,
+			  struct buffer_head *bh_result, int create);
+extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+
+#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
new file mode 100644
index 000000000000..00041b1b8025
--- /dev/null
+++ b/fs/gfs2/ops_dentry.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/smp_lock.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "ops_dentry.h"
+#include "util.h"
+
+/**
+ * gfs2_drevalidate - Check directory lookup consistency
+ * @dentry: the mapping to check
+ * @nd:
+ *
+ * Check to make sure the lookup necessary to arrive at this inode from its
+ * parent is still good.
+ *
+ * Returns: 1 if the dentry is ok, 0 if it isn't
+ */
+
+static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct dentry *parent = dget_parent(dentry);
+	struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode);
+	struct gfs2_inode *dip = GFS2_I(parent->d_inode);
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_holder d_gh;
+	struct gfs2_inode *ip;
+	struct gfs2_inum inum;
+	unsigned int type;
+	int error;
+
+	if (inode && is_bad_inode(inode))
+		goto invalid;
+
+	if (sdp->sd_args.ar_localcaching)
+		goto valid;
+
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+	if (error)
+		goto fail;
+
+	error = gfs2_dir_search(parent->d_inode, &dentry->d_name, &inum, &type);
+	switch (error) {
+	case 0:
+		if (!inode)
+			goto invalid_gunlock;
+		break;
+	case -ENOENT:
+		if (!inode)
+			goto valid_gunlock;
+		goto invalid_gunlock;
+	default:
+		goto fail_gunlock;
+	}
+
+	ip = GFS2_I(inode);
+
+	if (!gfs2_inum_equal(&ip->i_num, &inum))
+		goto invalid_gunlock;
+
+	if (IF2DT(ip->i_di.di_mode) != type) {
+		gfs2_consist_inode(dip);
+		goto fail_gunlock;
+	}
+
+valid_gunlock:
+	gfs2_glock_dq_uninit(&d_gh);
+valid:
+	dput(parent);
+	return 1;
+
+invalid_gunlock:
+	gfs2_glock_dq_uninit(&d_gh);
+invalid:
+	if (inode && S_ISDIR(inode->i_mode)) {
+		if (have_submounts(dentry))
+			goto valid;
+		shrink_dcache_parent(dentry);
+	}
+	d_drop(dentry);
+	dput(parent);
+	return 0;
+
+fail_gunlock:
+	gfs2_glock_dq_uninit(&d_gh);
+fail:
+	dput(parent);
+	return 0;
+}
+
+static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
+{
+	str->hash = gfs2_disk_hash(str->name, str->len);
+	return 0;
+}
+
+struct dentry_operations gfs2_dops = {
+	.d_revalidate = gfs2_drevalidate,
+	.d_hash = gfs2_dhash,
+};
+
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
new file mode 100644
index 000000000000..5caa3db4d3f5
--- /dev/null
+++ b/fs/gfs2/ops_dentry.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_DENTRY_DOT_H__
+#define __OPS_DENTRY_DOT_H__
+
+#include <linux/dcache.h>
+
+extern struct dentry_operations gfs2_dops;
+
+#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
new file mode 100644
index 000000000000..86127d93bd35
--- /dev/null
+++ b/fs/gfs2/ops_export.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "ops_export.h"
+#include "rgrp.h"
+#include "util.h"
+
+static struct dentry *gfs2_decode_fh(struct super_block *sb,
+				     __u32 *fh,
+				     int fh_len,
+				     int fh_type,
+				     int (*acceptable)(void *context,
+						       struct dentry *dentry),
+				     void *context)
+{
+	struct gfs2_fh_obj fh_obj;
+	struct gfs2_inum *this, parent;
+
+	if (fh_type != fh_len)
+		return NULL;
+
+	this 		= &fh_obj.this;
+	fh_obj.imode 	= DT_UNKNOWN;
+	memset(&parent, 0, sizeof(struct gfs2_inum));
+
+	switch (fh_type) {
+	case GFS2_LARGE_FH_SIZE:
+		parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32;
+		parent.no_formal_ino |= be32_to_cpu(fh[5]);
+		parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32;
+		parent.no_addr |= be32_to_cpu(fh[7]);
+		fh_obj.imode = be32_to_cpu(fh[8]);
+	case GFS2_SMALL_FH_SIZE:
+		this->no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32;
+		this->no_formal_ino |= be32_to_cpu(fh[1]);
+		this->no_addr = ((u64)be32_to_cpu(fh[2])) << 32;
+		this->no_addr |= be32_to_cpu(fh[3]);
+		break;
+	default:
+		return NULL;
+	}
+
+	return gfs2_export_ops.find_exported_dentry(sb, &fh_obj, &parent,
+						    acceptable, context);
+}
+
+static int gfs2_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
+			  int connectable)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (*len < GFS2_SMALL_FH_SIZE ||
+	    (connectable && *len < GFS2_LARGE_FH_SIZE))
+		return 255;
+
+	fh[0] = ip->i_num.no_formal_ino >> 32;
+	fh[0] = cpu_to_be32(fh[0]);
+	fh[1] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
+	fh[1] = cpu_to_be32(fh[1]);
+	fh[2] = ip->i_num.no_addr >> 32;
+	fh[2] = cpu_to_be32(fh[2]);
+	fh[3] = ip->i_num.no_addr & 0xFFFFFFFF;
+	fh[3] = cpu_to_be32(fh[3]);
+	*len = GFS2_SMALL_FH_SIZE;
+
+	if (!connectable || inode == sb->s_root->d_inode)
+		return *len;
+
+	spin_lock(&dentry->d_lock);
+	inode = dentry->d_parent->d_inode;
+	ip = GFS2_I(inode);
+	igrab(inode);
+	spin_unlock(&dentry->d_lock);
+
+	fh[4] = ip->i_num.no_formal_ino >> 32;
+	fh[4] = cpu_to_be32(fh[4]);
+	fh[5] = ip->i_num.no_formal_ino & 0xFFFFFFFF;
+	fh[5] = cpu_to_be32(fh[5]);
+	fh[6] = ip->i_num.no_addr >> 32;
+	fh[6] = cpu_to_be32(fh[6]);
+	fh[7] = ip->i_num.no_addr & 0xFFFFFFFF;
+	fh[7] = cpu_to_be32(fh[7]);
+
+	fh[8]  = cpu_to_be32(inode->i_mode);
+	fh[9]  = 0;	/* pad to double word */
+	*len = GFS2_LARGE_FH_SIZE;
+
+	iput(inode);
+
+	return *len;
+}
+
+struct get_name_filldir {
+	struct gfs2_inum inum;
+	char *name;
+};
+
+static int get_name_filldir(void *opaque, const char *name, unsigned int length,
+			    u64 offset, struct gfs2_inum *inum,
+			    unsigned int type)
+{
+	struct get_name_filldir *gnfd = (struct get_name_filldir *)opaque;
+
+	if (!gfs2_inum_equal(inum, &gnfd->inum))
+		return 0;
+
+	memcpy(gnfd->name, name, length);
+	gnfd->name[length] = 0;
+
+	return 1;
+}
+
+static int gfs2_get_name(struct dentry *parent, char *name,
+			 struct dentry *child)
+{
+	struct inode *dir = parent->d_inode;
+	struct inode *inode = child->d_inode;
+	struct gfs2_inode *dip, *ip;
+	struct get_name_filldir gnfd;
+	struct gfs2_holder gh;
+	u64 offset = 0;
+	int error;
+
+	if (!dir)
+		return -EINVAL;
+
+	if (!S_ISDIR(dir->i_mode) || !inode)
+		return -EINVAL;
+
+	dip = GFS2_I(dir);
+	ip = GFS2_I(inode);
+
+	*name = 0;
+	gnfd.inum = ip->i_num;
+	gnfd.name = name;
+
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh);
+	if (error)
+		return error;
+
+	error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir);
+
+	gfs2_glock_dq_uninit(&gh);
+
+	if (!error && !*name)
+		error = -ENOENT;
+
+	return error;
+}
+
+static struct dentry *gfs2_get_parent(struct dentry *child)
+{
+	struct qstr dotdot;
+	struct inode *inode;
+	struct dentry *dentry;
+
+	gfs2_str2qstr(&dotdot, "..");
+	inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
+
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+	/*
+	 * In case of an error, @inode carries the error value, and we
+	 * have to return that as a(n invalid) pointer to dentry.
+	 */
+	if (IS_ERR(inode))
+		return ERR_PTR(PTR_ERR(inode));
+
+	dentry = d_alloc_anon(inode);
+	if (!dentry) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return dentry;
+}
+
+static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_fh_obj *fh_obj = (struct gfs2_fh_obj *)inum_obj;
+	struct gfs2_inum *inum = &fh_obj->this;
+	struct gfs2_holder i_gh, ri_gh, rgd_gh;
+	struct gfs2_rgrpd *rgd;
+	struct inode *inode;
+	struct dentry *dentry;
+	int error;
+
+	/* System files? */
+
+	inode = gfs2_ilookup(sb, inum);
+	if (inode) {
+		if (GFS2_I(inode)->i_num.no_formal_ino != inum->no_formal_ino) {
+			iput(inode);
+			return ERR_PTR(-ESTALE);
+		}
+		goto out_inode;
+	}
+
+	error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops,
+				  LM_ST_SHARED, LM_FLAG_ANY | GL_LOCAL_EXCL,
+				  &i_gh);
+	if (error)
+		return ERR_PTR(error);
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto fail;
+
+	error = -EINVAL;
+	rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
+	if (!rgd)
+		goto fail_rindex;
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
+	if (error)
+		goto fail_rindex;
+
+	error = -ESTALE;
+	if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
+		goto fail_rgd;
+
+	gfs2_glock_dq_uninit(&rgd_gh);
+	gfs2_glock_dq_uninit(&ri_gh);
+
+	inode = gfs2_inode_lookup(sb, inum, fh_obj->imode);
+	if (!inode)
+		goto fail;
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		goto fail;
+	}
+
+	error = gfs2_inode_refresh(GFS2_I(inode));
+	if (error) {
+		iput(inode);
+		goto fail;
+	}
+
+	error = -EIO;
+	if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
+		iput(inode);
+		goto fail;
+	}
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+out_inode:
+	dentry = d_alloc_anon(inode);
+	if (!dentry) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return dentry;
+
+fail_rgd:
+	gfs2_glock_dq_uninit(&rgd_gh);
+
+fail_rindex:
+	gfs2_glock_dq_uninit(&ri_gh);
+
+fail:
+	gfs2_glock_dq_uninit(&i_gh);
+	return ERR_PTR(error);
+}
+
+struct export_operations gfs2_export_ops = {
+	.decode_fh = gfs2_decode_fh,
+	.encode_fh = gfs2_encode_fh,
+	.get_name = gfs2_get_name,
+	.get_parent = gfs2_get_parent,
+	.get_dentry = gfs2_get_dentry,
+};
+
diff --git a/fs/gfs2/ops_export.h b/fs/gfs2/ops_export.h
new file mode 100644
index 000000000000..09aca5046fb1
--- /dev/null
+++ b/fs/gfs2/ops_export.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_EXPORT_DOT_H__
+#define __OPS_EXPORT_DOT_H__
+
+#define GFS2_SMALL_FH_SIZE 4
+#define GFS2_LARGE_FH_SIZE 10
+
+extern struct export_operations gfs2_export_ops;
+struct gfs2_fh_obj {
+	struct gfs2_inum this;
+	__u32            imode;
+};
+
+#endif /* __OPS_EXPORT_DOT_H__ */
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
new file mode 100644
index 000000000000..3064f133bf3c
--- /dev/null
+++ b/fs/gfs2/ops_file.c
@@ -0,0 +1,661 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/blkdev.h>
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/ext2_fs.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "log.h"
+#include "meta_io.h"
+#include "ops_file.h"
+#include "ops_vm.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+#include "eaops.h"
+
+/* For regular, non-NFS */
+struct filldir_reg {
+	struct gfs2_sbd *fdr_sbd;
+	int fdr_prefetch;
+
+	filldir_t fdr_filldir;
+	void *fdr_opaque;
+};
+
+/*
+ * Most fields left uninitialised to catch anybody who tries to
+ * use them. f_flags set to prevent file_accessed() from touching
+ * any other part of this. Its use is purely as a flag so that we
+ * know (in readpage()) whether or not do to locking.
+ */
+struct file gfs2_internal_file_sentinel = {
+	.f_flags = O_NOATIME|O_RDONLY,
+};
+
+static int gfs2_read_actor(read_descriptor_t *desc, struct page *page,
+			   unsigned long offset, unsigned long size)
+{
+	char *kaddr;
+	unsigned long count = desc->count;
+
+	if (size > count)
+		size = count;
+
+	kaddr = kmap(page);
+	memcpy(desc->arg.buf, kaddr + offset, size);
+	kunmap(page);
+
+	desc->count = count - size;
+	desc->written += size;
+	desc->arg.buf += size;
+	return size;
+}
+
+int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state,
+		       char *buf, loff_t *pos, unsigned size)
+{
+	struct inode *inode = &ip->i_inode;
+	read_descriptor_t desc;
+	desc.written = 0;
+	desc.arg.buf = buf;
+	desc.count = size;
+	desc.error = 0;
+	do_generic_mapping_read(inode->i_mapping, ra_state,
+				&gfs2_internal_file_sentinel, pos, &desc,
+				gfs2_read_actor);
+	return desc.written ? desc.written : desc.error;
+}
+
+/**
+ * gfs2_llseek - seek to a location in a file
+ * @file: the file
+ * @offset: the offset
+ * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END)
+ *
+ * SEEK_END requires the glock for the file because it references the
+ * file's size.
+ *
+ * Returns: The new offset, or errno
+ */
+
+static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	struct gfs2_holder i_gh;
+	loff_t error;
+
+	if (origin == 2) {
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+					   &i_gh);
+		if (!error) {
+			error = remote_llseek(file, offset, origin);
+			gfs2_glock_dq_uninit(&i_gh);
+		}
+	} else
+		error = remote_llseek(file, offset, origin);
+
+	return error;
+}
+
+/**
+ * filldir_func - Report a directory entry to the caller of gfs2_dir_read()
+ * @opaque: opaque data used by the function
+ * @name: the name of the directory entry
+ * @length: the length of the name
+ * @offset: the entry's offset in the directory
+ * @inum: the inode number the entry points to
+ * @type: the type of inode the entry points to
+ *
+ * Returns: 0 on success, 1 if buffer full
+ */
+
+static int filldir_func(void *opaque, const char *name, unsigned int length,
+			u64 offset, struct gfs2_inum *inum,
+			unsigned int type)
+{
+	struct filldir_reg *fdr = (struct filldir_reg *)opaque;
+	struct gfs2_sbd *sdp = fdr->fdr_sbd;
+	int error;
+
+	error = fdr->fdr_filldir(fdr->fdr_opaque, name, length, offset,
+				 inum->no_addr, type);
+	if (error)
+		return 1;
+
+	if (fdr->fdr_prefetch && !(length == 1 && *name == '.')) {
+		gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_inode_glops,
+				       LM_ST_SHARED, LM_FLAG_TRY | LM_FLAG_ANY);
+		gfs2_glock_prefetch_num(sdp, inum->no_addr, &gfs2_iopen_glops,
+				       LM_ST_SHARED, LM_FLAG_TRY);
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_readdir - Read directory entries from a directory
+ * @file: The directory to read from
+ * @dirent: Buffer for dirents
+ * @filldir: Function used to do the copying
+ *
+ * Returns: errno
+ */
+
+static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+	struct inode *dir = file->f_mapping->host;
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct filldir_reg fdr;
+	struct gfs2_holder d_gh;
+	u64 offset = file->f_pos;
+	int error;
+
+	fdr.fdr_sbd = GFS2_SB(dir);
+	fdr.fdr_prefetch = 1;
+	fdr.fdr_filldir = filldir;
+	fdr.fdr_opaque = dirent;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
+	error = gfs2_glock_nq_atime(&d_gh);
+	if (error) {
+		gfs2_holder_uninit(&d_gh);
+		return error;
+	}
+
+	error = gfs2_dir_read(dir, &offset, &fdr, filldir_func);
+
+	gfs2_glock_dq_uninit(&d_gh);
+
+	file->f_pos = offset;
+
+	return error;
+}
+
+/**
+ * fsflags_cvt
+ * @table: A table of 32 u32 flags
+ * @val: a 32 bit value to convert
+ *
+ * This function can be used to convert between fsflags values and
+ * GFS2's own flags values.
+ *
+ * Returns: the converted flags
+ */
+static u32 fsflags_cvt(const u32 *table, u32 val)
+{
+	u32 res = 0;
+	while(val) {
+		if (val & 1)
+			res |= *table;
+		table++;
+		val >>= 1;
+	}
+	return res;
+}
+
+static const u32 fsflags_to_gfs2[32] = {
+	[3] = GFS2_DIF_SYNC,
+	[4] = GFS2_DIF_IMMUTABLE,
+	[5] = GFS2_DIF_APPENDONLY,
+	[7] = GFS2_DIF_NOATIME,
+	[12] = GFS2_DIF_EXHASH,
+	[14] = GFS2_DIF_JDATA,
+	[20] = GFS2_DIF_DIRECTIO,
+};
+
+static const u32 gfs2_to_fsflags[32] = {
+	[gfs2fl_Sync] = FS_SYNC_FL,
+	[gfs2fl_Immutable] = FS_IMMUTABLE_FL,
+	[gfs2fl_AppendOnly] = FS_APPEND_FL,
+	[gfs2fl_NoAtime] = FS_NOATIME_FL,
+	[gfs2fl_ExHash] = FS_INDEX_FL,
+	[gfs2fl_Jdata] = FS_JOURNAL_DATA_FL,
+	[gfs2fl_Directio] = FS_DIRECTIO_FL,
+	[gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
+	[gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
+};
+
+static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+	u32 fsflags;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+	error = gfs2_glock_nq_m_atime(1, &gh);
+	if (error)
+		return error;
+
+	fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
+	if (put_user(fsflags, ptr))
+		error = -EFAULT;
+
+	gfs2_glock_dq_m(1, &gh);
+	gfs2_holder_uninit(&gh);
+	return error;
+}
+
+/* Flags that can be set by user space */
+#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA|			\
+			     GFS2_DIF_DIRECTIO|			\
+			     GFS2_DIF_IMMUTABLE|		\
+			     GFS2_DIF_APPENDONLY|		\
+			     GFS2_DIF_NOATIME|			\
+			     GFS2_DIF_SYNC|			\
+			     GFS2_DIF_SYSTEM|			\
+			     GFS2_DIF_INHERIT_DIRECTIO|		\
+			     GFS2_DIF_INHERIT_JDATA)
+
+/**
+ * gfs2_set_flags - set flags on an inode
+ * @inode: The inode
+ * @flags: The flags to set
+ * @mask: Indicates which flags are valid
+ *
+ */
+static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *bh;
+	struct gfs2_holder gh;
+	int error;
+	u32 new_flags, flags;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (error)
+		return error;
+
+	flags = ip->i_di.di_flags;
+	new_flags = (flags & ~mask) | (reqflags & mask);
+	if ((new_flags ^ flags) == 0)
+		goto out;
+
+	if (S_ISDIR(inode->i_mode)) {
+		if ((new_flags ^ flags) & GFS2_DIF_JDATA)
+			new_flags ^= (GFS2_DIF_JDATA|GFS2_DIF_INHERIT_JDATA);
+		if ((new_flags ^ flags) & GFS2_DIF_DIRECTIO)
+			new_flags ^= (GFS2_DIF_DIRECTIO|GFS2_DIF_INHERIT_DIRECTIO);
+	}
+
+	error = -EINVAL;
+	if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET)
+		goto out;
+
+	error = -EPERM;
+	if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE))
+		goto out;
+	if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY))
+		goto out;
+	if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		goto out;
+	if (!IS_IMMUTABLE(inode)) {
+		error = permission(inode, MAY_WRITE, NULL);
+		if (error)
+			goto out;
+	}
+
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		goto out;
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		goto out_trans_end;
+	gfs2_trans_add_bh(ip->i_gl, bh, 1);
+	ip->i_di.di_flags = new_flags;
+	gfs2_dinode_out(&ip->i_di, bh->b_data);
+	brelse(bh);
+out_trans_end:
+	gfs2_trans_end(sdp);
+out:
+	gfs2_glock_dq_uninit(&gh);
+	return error;
+}
+
+static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
+{
+	u32 fsflags, gfsflags;
+	if (get_user(fsflags, ptr))
+		return -EFAULT;
+	gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
+	return do_gfs2_set_flags(filp, gfsflags, ~0);
+}
+
+static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	switch(cmd) {
+	case FS_IOC_GETFLAGS:
+		return gfs2_get_flags(filp, (u32 __user *)arg);
+	case FS_IOC_SETFLAGS:
+		return gfs2_set_flags(filp, (u32 __user *)arg);
+	}
+	return -ENOTTY;
+}
+
+
+/**
+ * gfs2_mmap -
+ * @file: The file to map
+ * @vma: The VMA which described the mapping
+ *
+ * Returns: 0 or error code
+ */
+
+static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	struct gfs2_holder i_gh;
+	int error;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
+	error = gfs2_glock_nq_atime(&i_gh);
+	if (error) {
+		gfs2_holder_uninit(&i_gh);
+		return error;
+	}
+
+	/* This is VM_MAYWRITE instead of VM_WRITE because a call
+	   to mprotect() can turn on VM_WRITE later. */
+
+	if ((vma->vm_flags & (VM_MAYSHARE | VM_MAYWRITE)) ==
+	    (VM_MAYSHARE | VM_MAYWRITE))
+		vma->vm_ops = &gfs2_vm_ops_sharewrite;
+	else
+		vma->vm_ops = &gfs2_vm_ops_private;
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+/**
+ * gfs2_open - open a file
+ * @inode: the inode to open
+ * @file: the struct file for this opening
+ *
+ * Returns: errno
+ */
+
+static int gfs2_open(struct inode *inode, struct file *file)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	struct gfs2_file *fp;
+	int error;
+
+	fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL);
+	if (!fp)
+		return -ENOMEM;
+
+	mutex_init(&fp->f_fl_mutex);
+
+	gfs2_assert_warn(GFS2_SB(inode), !file->private_data);
+	file->private_data = fp;
+
+	if (S_ISREG(ip->i_di.di_mode)) {
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+					   &i_gh);
+		if (error)
+			goto fail;
+
+		if (!(file->f_flags & O_LARGEFILE) &&
+		    ip->i_di.di_size > MAX_NON_LFS) {
+			error = -EFBIG;
+			goto fail_gunlock;
+		}
+
+		/* Listen to the Direct I/O flag */
+
+		if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
+			file->f_flags |= O_DIRECT;
+
+		gfs2_glock_dq_uninit(&i_gh);
+	}
+
+	return 0;
+
+fail_gunlock:
+	gfs2_glock_dq_uninit(&i_gh);
+fail:
+	file->private_data = NULL;
+	kfree(fp);
+	return error;
+}
+
+/**
+ * gfs2_close - called to close a struct file
+ * @inode: the inode the struct file belongs to
+ * @file: the struct file being closed
+ *
+ * Returns: errno
+ */
+
+static int gfs2_close(struct inode *inode, struct file *file)
+{
+	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_file *fp;
+
+	fp = file->private_data;
+	file->private_data = NULL;
+
+	if (gfs2_assert_warn(sdp, fp))
+		return -EIO;
+
+	kfree(fp);
+
+	return 0;
+}
+
+/**
+ * gfs2_fsync - sync the dirty data for a file (across the cluster)
+ * @file: the file that points to the dentry (we ignore this)
+ * @dentry: the dentry that points to the inode to sync
+ *
+ * Returns: errno
+ */
+
+static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+
+	gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
+
+	return 0;
+}
+
+/**
+ * gfs2_lock - acquire/release a posix lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
+	struct lm_lockname name =
+		{ .ln_number = ip->i_num.no_addr,
+		  .ln_type = LM_TYPE_PLOCK };
+
+	if (!(fl->fl_flags & FL_POSIX))
+		return -ENOLCK;
+	if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+		return -ENOLCK;
+
+	if (sdp->sd_args.ar_localflocks) {
+		if (IS_GETLK(cmd)) {
+			struct file_lock tmp;
+			int ret;
+			ret = posix_test_lock(file, fl, &tmp);
+			fl->fl_type = F_UNLCK;
+			if (ret)
+				memcpy(fl, &tmp, sizeof(struct file_lock));
+			return 0;
+		} else {
+			return posix_lock_file_wait(file, fl);
+		}
+	}
+
+	if (IS_GETLK(cmd))
+		return gfs2_lm_plock_get(sdp, &name, file, fl);
+	else if (fl->fl_type == F_UNLCK)
+		return gfs2_lm_punlock(sdp, &name, file, fl);
+	else
+		return gfs2_lm_plock(sdp, &name, file, cmd, fl);
+}
+
+static int do_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gfs2_file *fp = file->private_data;
+	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+	struct gfs2_inode *ip = GFS2_I(file->f_dentry->d_inode);
+	struct gfs2_glock *gl;
+	unsigned int state;
+	int flags;
+	int error = 0;
+
+	state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
+	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
+
+	mutex_lock(&fp->f_fl_mutex);
+
+	gl = fl_gh->gh_gl;
+	if (gl) {
+		if (fl_gh->gh_state == state)
+			goto out;
+		gfs2_glock_hold(gl);
+		flock_lock_file_wait(file,
+				     &(struct file_lock){.fl_type = F_UNLCK});
+		gfs2_glock_dq_uninit(fl_gh);
+	} else {
+		error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
+				      ip->i_num.no_addr, &gfs2_flock_glops,
+				      CREATE, &gl);
+		if (error)
+			goto out;
+	}
+
+	gfs2_holder_init(gl, state, flags, fl_gh);
+	gfs2_glock_put(gl);
+
+	error = gfs2_glock_nq(fl_gh);
+	if (error) {
+		gfs2_holder_uninit(fl_gh);
+		if (error == GLR_TRYFAILED)
+			error = -EAGAIN;
+	} else {
+		error = flock_lock_file_wait(file, fl);
+		gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
+	}
+
+out:
+	mutex_unlock(&fp->f_fl_mutex);
+	return error;
+}
+
+static void do_unflock(struct file *file, struct file_lock *fl)
+{
+	struct gfs2_file *fp = file->private_data;
+	struct gfs2_holder *fl_gh = &fp->f_fl_gh;
+
+	mutex_lock(&fp->f_fl_mutex);
+	flock_lock_file_wait(file, fl);
+	if (fl_gh->gh_gl)
+		gfs2_glock_dq_uninit(fl_gh);
+	mutex_unlock(&fp->f_fl_mutex);
+}
+
+/**
+ * gfs2_flock - acquire/release a flock lock on a file
+ * @file: the file pointer
+ * @cmd: either modify or retrieve lock state, possibly wait
+ * @fl: type and range of lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
+
+	if (!(fl->fl_flags & FL_FLOCK))
+		return -ENOLCK;
+	if ((ip->i_di.di_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+		return -ENOLCK;
+
+	if (sdp->sd_args.ar_localflocks)
+		return flock_lock_file_wait(file, fl);
+
+	if (fl->fl_type == F_UNLCK) {
+		do_unflock(file, fl);
+		return 0;
+	} else {
+		return do_flock(file, cmd, fl);
+	}
+}
+
+const struct file_operations gfs2_file_fops = {
+	.llseek		= gfs2_llseek,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.mmap		= gfs2_mmap,
+	.open		= gfs2_open,
+	.release	= gfs2_close,
+	.fsync		= gfs2_fsync,
+	.lock		= gfs2_lock,
+	.sendfile	= generic_file_sendfile,
+	.flock		= gfs2_flock,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+};
+
+const struct file_operations gfs2_dir_fops = {
+	.readdir	= gfs2_readdir,
+	.unlocked_ioctl	= gfs2_ioctl,
+	.open		= gfs2_open,
+	.release	= gfs2_close,
+	.fsync		= gfs2_fsync,
+	.lock		= gfs2_lock,
+	.flock		= gfs2_flock,
+};
+
diff --git a/fs/gfs2/ops_file.h b/fs/gfs2/ops_file.h
new file mode 100644
index 000000000000..ce319f89ec8e
--- /dev/null
+++ b/fs/gfs2/ops_file.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_FILE_DOT_H__
+#define __OPS_FILE_DOT_H__
+
+#include <linux/fs.h>
+struct gfs2_inode;
+
+extern struct file gfs2_internal_file_sentinel;
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+			      struct file_ra_state *ra_state,
+			      char *buf, loff_t *pos, unsigned size);
+
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+
+#endif /* __OPS_FILE_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
new file mode 100644
index 000000000000..882873a6bd69
--- /dev/null
+++ b/fs/gfs2/ops_fstype.c
@@ -0,0 +1,925 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "daemon.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "lm.h"
+#include "mount.h"
+#include "ops_export.h"
+#include "ops_fstype.h"
+#include "ops_super.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "sys.h"
+#include "util.h"
+
+#define DO 0
+#define UNDO 1
+
+extern struct dentry_operations gfs2_dops;
+
+static struct gfs2_sbd *init_sbd(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp;
+
+	sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
+	if (!sdp)
+		return NULL;
+
+	sb->s_fs_info = sdp;
+	sdp->sd_vfs = sb;
+
+	gfs2_tune_init(&sdp->sd_tune);
+
+	INIT_LIST_HEAD(&sdp->sd_reclaim_list);
+	spin_lock_init(&sdp->sd_reclaim_lock);
+	init_waitqueue_head(&sdp->sd_reclaim_wq);
+
+	mutex_init(&sdp->sd_inum_mutex);
+	spin_lock_init(&sdp->sd_statfs_spin);
+	mutex_init(&sdp->sd_statfs_mutex);
+
+	spin_lock_init(&sdp->sd_rindex_spin);
+	mutex_init(&sdp->sd_rindex_mutex);
+	INIT_LIST_HEAD(&sdp->sd_rindex_list);
+	INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
+	INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
+
+	INIT_LIST_HEAD(&sdp->sd_jindex_list);
+	spin_lock_init(&sdp->sd_jindex_spin);
+	mutex_init(&sdp->sd_jindex_mutex);
+
+	INIT_LIST_HEAD(&sdp->sd_quota_list);
+	spin_lock_init(&sdp->sd_quota_spin);
+	mutex_init(&sdp->sd_quota_mutex);
+
+	spin_lock_init(&sdp->sd_log_lock);
+
+	INIT_LIST_HEAD(&sdp->sd_log_le_gl);
+	INIT_LIST_HEAD(&sdp->sd_log_le_buf);
+	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
+	INIT_LIST_HEAD(&sdp->sd_log_le_rg);
+	INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
+
+	mutex_init(&sdp->sd_log_reserve_mutex);
+	INIT_LIST_HEAD(&sdp->sd_ail1_list);
+	INIT_LIST_HEAD(&sdp->sd_ail2_list);
+
+	init_rwsem(&sdp->sd_log_flush_lock);
+	INIT_LIST_HEAD(&sdp->sd_log_flush_list);
+
+	INIT_LIST_HEAD(&sdp->sd_revoke_list);
+
+	mutex_init(&sdp->sd_freeze_lock);
+
+	return sdp;
+}
+
+static void init_vfs(struct super_block *sb, unsigned noatime)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+
+	sb->s_magic = GFS2_MAGIC;
+	sb->s_op = &gfs2_super_ops;
+	sb->s_export_op = &gfs2_export_ops;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+
+	if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
+		set_bit(noatime, &sdp->sd_flags);
+
+	/* Don't let the VFS update atimes.  GFS2 handles this itself. */
+	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
+}
+
+static int init_names(struct gfs2_sbd *sdp, int silent)
+{
+	struct page *page;
+	char *proto, *table;
+	int error = 0;
+
+	proto = sdp->sd_args.ar_lockproto;
+	table = sdp->sd_args.ar_locktable;
+
+	/*  Try to autodetect  */
+
+	if (!proto[0] || !table[0]) {
+		struct gfs2_sb *sb;
+		page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+		if (!page)
+			return -ENOBUFS;
+		sb = kmap(page);
+		gfs2_sb_in(&sdp->sd_sb, sb);
+		kunmap(page);
+		__free_page(page);
+
+		error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+		if (error)
+			goto out;
+
+		if (!proto[0])
+			proto = sdp->sd_sb.sb_lockproto;
+		if (!table[0])
+			table = sdp->sd_sb.sb_locktable;
+	}
+
+	if (!table[0])
+		table = sdp->sd_vfs->s_id;
+
+	snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
+	snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
+
+out:
+	return error;
+}
+
+static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
+			int undo)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	if (undo)
+		goto fail_trans;
+
+	p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
+	error = IS_ERR(p);
+	if (error) {
+		fs_err(sdp, "can't start scand thread: %d\n", error);
+		return error;
+	}
+	sdp->sd_scand_process = p;
+
+	for (sdp->sd_glockd_num = 0;
+	     sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
+	     sdp->sd_glockd_num++) {
+		p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
+		error = IS_ERR(p);
+		if (error) {
+			fs_err(sdp, "can't start glockd thread: %d\n", error);
+			goto fail;
+		}
+		sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
+	}
+
+	error = gfs2_glock_nq_num(sdp,
+				  GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
+				  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
+				  mount_gh);
+	if (error) {
+		fs_err(sdp, "can't acquire mount glock: %d\n", error);
+		goto fail;
+	}
+
+	error = gfs2_glock_nq_num(sdp,
+				  GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
+				  LM_ST_SHARED,
+				  LM_FLAG_NOEXP | GL_EXACT,
+				  &sdp->sd_live_gh);
+	if (error) {
+		fs_err(sdp, "can't acquire live glock: %d\n", error);
+		goto fail_mount;
+	}
+
+	error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
+			       CREATE, &sdp->sd_rename_gl);
+	if (error) {
+		fs_err(sdp, "can't create rename glock: %d\n", error);
+		goto fail_live;
+	}
+
+	error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
+			       CREATE, &sdp->sd_trans_gl);
+	if (error) {
+		fs_err(sdp, "can't create transaction glock: %d\n", error);
+		goto fail_rename;
+	}
+	set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
+
+	return 0;
+
+fail_trans:
+	gfs2_glock_put(sdp->sd_trans_gl);
+fail_rename:
+	gfs2_glock_put(sdp->sd_rename_gl);
+fail_live:
+	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+fail_mount:
+	gfs2_glock_dq_uninit(mount_gh);
+fail:
+	while (sdp->sd_glockd_num--)
+		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
+
+	kthread_stop(sdp->sd_scand_process);
+	return error;
+}
+
+static struct inode *gfs2_lookup_root(struct super_block *sb,
+				      struct gfs2_inum *inum)
+{
+	return gfs2_inode_lookup(sb, inum, DT_DIR);
+}
+
+static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	struct gfs2_holder sb_gh;
+	struct gfs2_inum *inum;
+	struct inode *inode;
+	int error = 0;
+
+	if (undo) {
+		if (sb->s_root) {
+			dput(sb->s_root);
+			sb->s_root = NULL;
+		}
+		return 0;
+	}
+
+	error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
+				 LM_ST_SHARED, 0, &sb_gh);
+	if (error) {
+		fs_err(sdp, "can't acquire superblock glock: %d\n", error);
+		return error;
+	}
+
+	error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
+	if (error) {
+		fs_err(sdp, "can't read superblock: %d\n", error);
+		goto out;
+	}
+
+	/* Set up the buffer cache and SB for real */
+	if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
+		error = -EINVAL;
+		fs_err(sdp, "FS block size (%u) is too small for device "
+		       "block size (%u)\n",
+		       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
+		goto out;
+	}
+	if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
+		error = -EINVAL;
+		fs_err(sdp, "FS block size (%u) is too big for machine "
+		       "page size (%u)\n",
+		       sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
+		goto out;
+	}
+	sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
+
+	/* Get the root inode */
+	inum = &sdp->sd_sb.sb_root_dir;
+	if (sb->s_type == &gfs2meta_fs_type)
+		inum = &sdp->sd_sb.sb_master_dir;
+	inode = gfs2_lookup_root(sb, inum);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		fs_err(sdp, "can't read in root inode: %d\n", error);
+		goto out;
+	}
+
+	sb->s_root = d_alloc_root(inode);
+	if (!sb->s_root) {
+		fs_err(sdp, "can't get root dentry\n");
+		error = -ENOMEM;
+		iput(inode);
+	}
+	sb->s_root->d_op = &gfs2_dops;
+out:
+	gfs2_glock_dq_uninit(&sb_gh);
+	return error;
+}
+
+static int init_journal(struct gfs2_sbd *sdp, int undo)
+{
+	struct gfs2_holder ji_gh;
+	struct task_struct *p;
+	struct gfs2_inode *ip;
+	int jindex = 1;
+	int error = 0;
+
+	if (undo) {
+		jindex = 0;
+		goto fail_recoverd;
+	}
+
+	sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
+	if (IS_ERR(sdp->sd_jindex)) {
+		fs_err(sdp, "can't lookup journal index: %d\n", error);
+		return PTR_ERR(sdp->sd_jindex);
+	}
+	ip = GFS2_I(sdp->sd_jindex);
+	set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
+
+	/* Load in the journal index special file */
+
+	error = gfs2_jindex_hold(sdp, &ji_gh);
+	if (error) {
+		fs_err(sdp, "can't read journal index: %d\n", error);
+		goto fail;
+	}
+
+	error = -EINVAL;
+	if (!gfs2_jindex_size(sdp)) {
+		fs_err(sdp, "no journals!\n");
+		goto fail_jindex;
+	}
+
+	if (sdp->sd_args.ar_spectator) {
+		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
+		sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+	} else {
+		if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
+			fs_err(sdp, "can't mount journal #%u\n",
+			       sdp->sd_lockstruct.ls_jid);
+			fs_err(sdp, "there are only %u journals (0 - %u)\n",
+			       gfs2_jindex_size(sdp),
+			       gfs2_jindex_size(sdp) - 1);
+			goto fail_jindex;
+		}
+		sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid);
+
+		error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid,
+					  &gfs2_journal_glops,
+					  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP,
+					  &sdp->sd_journal_gh);
+		if (error) {
+			fs_err(sdp, "can't acquire journal glock: %d\n", error);
+			goto fail_jindex;
+		}
+
+		ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+					   LM_FLAG_NOEXP | GL_EXACT,
+					   &sdp->sd_jinode_gh);
+		if (error) {
+			fs_err(sdp, "can't acquire journal inode glock: %d\n",
+			       error);
+			goto fail_journal_gh;
+		}
+
+		error = gfs2_jdesc_check(sdp->sd_jdesc);
+		if (error) {
+			fs_err(sdp, "my journal (%u) is bad: %d\n",
+			       sdp->sd_jdesc->jd_jid, error);
+			goto fail_jinode_gh;
+		}
+		sdp->sd_log_blks_free = sdp->sd_jdesc->jd_blocks;
+	}
+
+	if (sdp->sd_lockstruct.ls_first) {
+		unsigned int x;
+		for (x = 0; x < sdp->sd_journals; x++) {
+			error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x));
+			if (error) {
+				fs_err(sdp, "error recovering journal %u: %d\n",
+				       x, error);
+				goto fail_jinode_gh;
+			}
+		}
+
+		gfs2_lm_others_may_mount(sdp);
+	} else if (!sdp->sd_args.ar_spectator) {
+		error = gfs2_recover_journal(sdp->sd_jdesc);
+		if (error) {
+			fs_err(sdp, "error recovering my journal: %d\n", error);
+			goto fail_jinode_gh;
+		}
+	}
+
+	set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
+	gfs2_glock_dq_uninit(&ji_gh);
+	jindex = 0;
+
+	p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
+	error = IS_ERR(p);
+	if (error) {
+		fs_err(sdp, "can't start recoverd thread: %d\n", error);
+		goto fail_jinode_gh;
+	}
+	sdp->sd_recoverd_process = p;
+
+	return 0;
+
+fail_recoverd:
+	kthread_stop(sdp->sd_recoverd_process);
+fail_jinode_gh:
+	if (!sdp->sd_args.ar_spectator)
+		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+fail_journal_gh:
+	if (!sdp->sd_args.ar_spectator)
+		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+fail_jindex:
+	gfs2_jindex_free(sdp);
+	if (jindex)
+		gfs2_glock_dq_uninit(&ji_gh);
+fail:
+	iput(sdp->sd_jindex);
+	return error;
+}
+
+
+static int init_inodes(struct gfs2_sbd *sdp, int undo)
+{
+	int error = 0;
+	struct gfs2_inode *ip;
+	struct inode *inode;
+
+	if (undo)
+		goto fail_qinode;
+
+	inode = gfs2_lookup_root(sdp->sd_vfs, &sdp->sd_sb.sb_master_dir);
+	if (IS_ERR(inode)) {
+		error = PTR_ERR(inode);
+		fs_err(sdp, "can't read in master directory: %d\n", error);
+		goto fail;
+	}
+	sdp->sd_master_dir = inode;
+
+	error = init_journal(sdp, undo);
+	if (error)
+		goto fail_master;
+
+	/* Read in the master inode number inode */
+	sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
+	if (IS_ERR(sdp->sd_inum_inode)) {
+		error = PTR_ERR(sdp->sd_inum_inode);
+		fs_err(sdp, "can't read in inum inode: %d\n", error);
+		goto fail_journal;
+	}
+
+
+	/* Read in the master statfs inode */
+	sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
+	if (IS_ERR(sdp->sd_statfs_inode)) {
+		error = PTR_ERR(sdp->sd_statfs_inode);
+		fs_err(sdp, "can't read in statfs inode: %d\n", error);
+		goto fail_inum;
+	}
+
+	/* Read in the resource index inode */
+	sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
+	if (IS_ERR(sdp->sd_rindex)) {
+		error = PTR_ERR(sdp->sd_rindex);
+		fs_err(sdp, "can't get resource index inode: %d\n", error);
+		goto fail_statfs;
+	}
+	ip = GFS2_I(sdp->sd_rindex);
+	set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
+	sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1;
+
+	/* Read in the quota inode */
+	sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
+	if (IS_ERR(sdp->sd_quota_inode)) {
+		error = PTR_ERR(sdp->sd_quota_inode);
+		fs_err(sdp, "can't get quota file inode: %d\n", error);
+		goto fail_rindex;
+	}
+	return 0;
+
+fail_qinode:
+	iput(sdp->sd_quota_inode);
+fail_rindex:
+	gfs2_clear_rgrpd(sdp);
+	iput(sdp->sd_rindex);
+fail_statfs:
+	iput(sdp->sd_statfs_inode);
+fail_inum:
+	iput(sdp->sd_inum_inode);
+fail_journal:
+	init_journal(sdp, UNDO);
+fail_master:
+	iput(sdp->sd_master_dir);
+fail:
+	return error;
+}
+
+static int init_per_node(struct gfs2_sbd *sdp, int undo)
+{
+	struct inode *pn = NULL;
+	char buf[30];
+	int error = 0;
+	struct gfs2_inode *ip;
+
+	if (sdp->sd_args.ar_spectator)
+		return 0;
+
+	if (undo)
+		goto fail_qc_gh;
+
+	pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
+	if (IS_ERR(pn)) {
+		error = PTR_ERR(pn);
+		fs_err(sdp, "can't find per_node directory: %d\n", error);
+		return error;
+	}
+
+	sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
+	sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
+	if (IS_ERR(sdp->sd_ir_inode)) {
+		error = PTR_ERR(sdp->sd_ir_inode);
+		fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
+		goto fail;
+	}
+
+	sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
+	sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
+	if (IS_ERR(sdp->sd_sc_inode)) {
+		error = PTR_ERR(sdp->sd_sc_inode);
+		fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
+		goto fail_ir_i;
+	}
+
+	sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
+	sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf);
+	if (IS_ERR(sdp->sd_qc_inode)) {
+		error = PTR_ERR(sdp->sd_qc_inode);
+		fs_err(sdp, "can't find local \"qc\" file: %d\n", error);
+		goto fail_ut_i;
+	}
+
+	iput(pn);
+	pn = NULL;
+
+	ip = GFS2_I(sdp->sd_ir_inode);
+	error = gfs2_glock_nq_init(ip->i_gl,
+				   LM_ST_EXCLUSIVE, 0,
+				   &sdp->sd_ir_gh);
+	if (error) {
+		fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
+		goto fail_qc_i;
+	}
+
+	ip = GFS2_I(sdp->sd_sc_inode);
+	error = gfs2_glock_nq_init(ip->i_gl,
+				   LM_ST_EXCLUSIVE, 0,
+				   &sdp->sd_sc_gh);
+	if (error) {
+		fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
+		goto fail_ir_gh;
+	}
+
+	ip = GFS2_I(sdp->sd_qc_inode);
+	error = gfs2_glock_nq_init(ip->i_gl,
+				   LM_ST_EXCLUSIVE, 0,
+				   &sdp->sd_qc_gh);
+	if (error) {
+		fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
+		goto fail_ut_gh;
+	}
+
+	return 0;
+
+fail_qc_gh:
+	gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+fail_ut_gh:
+	gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+fail_ir_gh:
+	gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+fail_qc_i:
+	iput(sdp->sd_qc_inode);
+fail_ut_i:
+	iput(sdp->sd_sc_inode);
+fail_ir_i:
+	iput(sdp->sd_ir_inode);
+fail:
+	if (pn)
+		iput(pn);
+	return error;
+}
+
+static int init_threads(struct gfs2_sbd *sdp, int undo)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	if (undo)
+		goto fail_quotad;
+
+	sdp->sd_log_flush_time = jiffies;
+	sdp->sd_jindex_refresh_time = jiffies;
+
+	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
+	error = IS_ERR(p);
+	if (error) {
+		fs_err(sdp, "can't start logd thread: %d\n", error);
+		return error;
+	}
+	sdp->sd_logd_process = p;
+
+	sdp->sd_statfs_sync_time = jiffies;
+	sdp->sd_quota_sync_time = jiffies;
+
+	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
+	error = IS_ERR(p);
+	if (error) {
+		fs_err(sdp, "can't start quotad thread: %d\n", error);
+		goto fail;
+	}
+	sdp->sd_quotad_process = p;
+
+	return 0;
+
+
+fail_quotad:
+	kthread_stop(sdp->sd_quotad_process);
+fail:
+	kthread_stop(sdp->sd_logd_process);
+	return error;
+}
+
+/**
+ * fill_super - Read in superblock
+ * @sb: The VFS superblock
+ * @data: Mount options
+ * @silent: Don't complain if it's not a GFS2 filesystem
+ *
+ * Returns: errno
+ */
+
+static int fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct gfs2_sbd *sdp;
+	struct gfs2_holder mount_gh;
+	int error;
+
+	sdp = init_sbd(sb);
+	if (!sdp) {
+		printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
+		return -ENOMEM;
+	}
+
+	error = gfs2_mount_args(sdp, (char *)data, 0);
+	if (error) {
+		printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
+		goto fail;
+	}
+
+	init_vfs(sb, SDF_NOATIME);
+
+	/* Set up the buffer cache and fill in some fake block size values
+	   to allow us to read-in the on-disk superblock. */
+	sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK);
+	sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
+	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+                               GFS2_BASIC_BLOCK_SHIFT;
+	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+
+	error = init_names(sdp, silent);
+	if (error)
+		goto fail;
+
+	error = gfs2_sys_fs_add(sdp);
+	if (error)
+		goto fail;
+
+	error = gfs2_lm_mount(sdp, silent);
+	if (error)
+		goto fail_sys;
+
+	error = init_locking(sdp, &mount_gh, DO);
+	if (error)
+		goto fail_lm;
+
+	error = init_sb(sdp, silent, DO);
+	if (error)
+		goto fail_locking;
+
+	error = init_inodes(sdp, DO);
+	if (error)
+		goto fail_sb;
+
+	error = init_per_node(sdp, DO);
+	if (error)
+		goto fail_inodes;
+
+	error = gfs2_statfs_init(sdp);
+	if (error) {
+		fs_err(sdp, "can't initialize statfs subsystem: %d\n", error);
+		goto fail_per_node;
+	}
+
+	error = init_threads(sdp, DO);
+	if (error)
+		goto fail_per_node;
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		error = gfs2_make_fs_rw(sdp);
+		if (error) {
+			fs_err(sdp, "can't make FS RW: %d\n", error);
+			goto fail_threads;
+		}
+	}
+
+	gfs2_glock_dq_uninit(&mount_gh);
+
+	return 0;
+
+fail_threads:
+	init_threads(sdp, UNDO);
+fail_per_node:
+	init_per_node(sdp, UNDO);
+fail_inodes:
+	init_inodes(sdp, UNDO);
+fail_sb:
+	init_sb(sdp, 0, UNDO);
+fail_locking:
+	init_locking(sdp, &mount_gh, UNDO);
+fail_lm:
+	gfs2_gl_hash_clear(sdp, WAIT);
+	gfs2_lm_unmount(sdp);
+	while (invalidate_inodes(sb))
+		yield();
+fail_sys:
+	gfs2_sys_fs_del(sdp);
+fail:
+	kfree(sdp);
+	sb->s_fs_info = NULL;
+	return error;
+}
+
+static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	struct super_block *sb;
+	struct gfs2_sbd *sdp;
+	int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
+	if (error)
+		goto out;
+	sb = mnt->mnt_sb;
+	sdp = sb->s_fs_info;
+	sdp->sd_gfs2mnt = mnt;
+out:
+	return error;
+}
+
+static int fill_super_meta(struct super_block *sb, struct super_block *new,
+			   void *data, int silent)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct inode *inode;
+	int error = 0;
+
+	new->s_fs_info = sdp;
+	sdp->sd_vfs_meta = sb;
+
+	init_vfs(new, SDF_NOATIME);
+
+        /* Get the master inode */
+	inode = igrab(sdp->sd_master_dir);
+
+	new->s_root = d_alloc_root(inode);
+	if (!new->s_root) {
+		fs_err(sdp, "can't get root dentry\n");
+		error = -ENOMEM;
+		iput(inode);
+	} else
+		new->s_root->d_op = &gfs2_dops;
+
+	return error;
+}
+
+static int set_bdev_super(struct super_block *s, void *data)
+{
+	s->s_bdev = data;
+	s->s_dev = s->s_bdev->bd_dev;
+	return 0;
+}
+
+static int test_bdev_super(struct super_block *s, void *data)
+{
+	return s->s_bdev == data;
+}
+
+static struct super_block* get_gfs2_sb(const char *dev_name)
+{
+	struct kstat stat;
+	struct nameidata nd;
+	struct file_system_type *fstype;
+	struct super_block *sb = NULL, *s;
+	struct list_head *l;
+	int error;
+
+	error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
+	if (error) {
+		printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n",
+		       dev_name);
+		goto out;
+	}
+	error = vfs_getattr(nd.mnt, nd.dentry, &stat);
+
+	fstype = get_fs_type("gfs2");
+	list_for_each(l, &fstype->fs_supers) {
+		s = list_entry(l, struct super_block, s_instances);
+		if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
+		    (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
+			sb = s;
+			goto free_nd;
+		}
+	}
+
+	printk(KERN_WARNING "GFS2: Unrecognized block device or "
+	       "mount point %s", dev_name);
+
+free_nd:
+	path_release(&nd);
+out:
+	return sb;
+}
+
+static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
+			    const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	int error = 0;
+	struct super_block *sb = NULL, *new;
+	struct gfs2_sbd *sdp;
+
+	sb = get_gfs2_sb(dev_name);
+	if (!sb) {
+		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
+		error = -ENOENT;
+		goto error;
+	}
+	sdp = (struct gfs2_sbd*) sb->s_fs_info;
+	if (sdp->sd_vfs_meta) {
+		printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
+		error = -EBUSY;
+		goto error;
+	}
+	mutex_lock(&sb->s_bdev->bd_mount_mutex);
+	new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
+	mutex_unlock(&sb->s_bdev->bd_mount_mutex);
+	if (IS_ERR(new)) {
+		error = PTR_ERR(new);
+		goto error;
+	}
+	module_put(fs_type->owner);
+	new->s_flags = flags;
+	strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
+	sb_set_blocksize(new, sb->s_blocksize);
+	error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
+	if (error) {
+		up_write(&new->s_umount);
+		deactivate_super(new);
+		goto error;
+	}
+
+	new->s_flags |= MS_ACTIVE;
+
+	/* Grab a reference to the gfs2 mount point */
+	atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
+	return simple_set_mnt(mnt, new);
+error:
+	return error;
+}
+
+static void gfs2_kill_sb(struct super_block *sb)
+{
+	kill_block_super(sb);
+}
+
+static void gfs2_kill_sb_meta(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	generic_shutdown_super(sb);
+	sdp->sd_vfs_meta = NULL;
+	atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
+}
+
+struct file_system_type gfs2_fs_type = {
+	.name = "gfs2",
+	.fs_flags = FS_REQUIRES_DEV,
+	.get_sb = gfs2_get_sb,
+	.kill_sb = gfs2_kill_sb,
+	.owner = THIS_MODULE,
+};
+
+struct file_system_type gfs2meta_fs_type = {
+	.name = "gfs2meta",
+	.fs_flags = FS_REQUIRES_DEV,
+	.get_sb = gfs2_get_sb_meta,
+	.kill_sb = gfs2_kill_sb_meta,
+	.owner = THIS_MODULE,
+};
+
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
new file mode 100644
index 000000000000..7cc2c296271b
--- /dev/null
+++ b/fs/gfs2/ops_fstype.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_FSTYPE_DOT_H__
+#define __OPS_FSTYPE_DOT_H__
+
+#include <linux/fs.h>
+
+extern struct file_system_type gfs2_fs_type;
+extern struct file_system_type gfs2meta_fs_type;
+
+#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
new file mode 100644
index 000000000000..ef6e5ed70e94
--- /dev/null
+++ b/fs/gfs2/ops_inode.c
@@ -0,0 +1,1151 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/namei.h>
+#include <linux/utsname.h>
+#include <linux/mm.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "bmap.h"
+#include "dir.h"
+#include "eaops.h"
+#include "eattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "ops_dentry.h"
+#include "ops_inode.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+/**
+ * gfs2_create - Create a file
+ * @dir: The directory in which to create the file
+ * @dentry: The dentry of the new file
+ * @mode: The mode of the new file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_create(struct inode *dir, struct dentry *dentry,
+		       int mode, struct nameidata *nd)
+{
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct gfs2_holder ghs[2];
+	struct inode *inode;
+
+	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+
+	for (;;) {
+		inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode);
+		if (!IS_ERR(inode)) {
+			gfs2_trans_end(sdp);
+			if (dip->i_alloc.al_rgd)
+				gfs2_inplace_release(dip);
+			gfs2_quota_unlock(dip);
+			gfs2_alloc_put(dip);
+			gfs2_glock_dq_uninit_m(2, ghs);
+			mark_inode_dirty(inode);
+			break;
+		} else if (PTR_ERR(inode) != -EEXIST ||
+			   (nd->intent.open.flags & O_EXCL)) {
+			gfs2_holder_uninit(ghs);
+			return PTR_ERR(inode);
+		}
+
+		inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+		if (inode) {
+			if (!IS_ERR(inode)) {
+				gfs2_holder_uninit(ghs);
+				break;
+			} else {
+				gfs2_holder_uninit(ghs);
+				return PTR_ERR(inode);
+			}
+		}
+	}
+
+	d_instantiate(dentry, inode);
+
+	return 0;
+}
+
+/**
+ * gfs2_lookup - Look up a filename in a directory and return its inode
+ * @dir: The directory inode
+ * @dentry: The dentry of the new inode
+ * @nd: passed from Linux VFS, ignored by us
+ *
+ * Called by the VFS layer. Lock dir and call gfs2_lookupi()
+ *
+ * Returns: errno
+ */
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+				  struct nameidata *nd)
+{
+	struct inode *inode = NULL;
+
+	dentry->d_op = &gfs2_dops;
+
+	inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+	if (inode && IS_ERR(inode))
+		return ERR_PTR(PTR_ERR(inode));
+
+	if (inode)
+		return d_splice_alias(inode, dentry);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+/**
+ * gfs2_link - Link to a file
+ * @old_dentry: The inode to link
+ * @dir: Add link to this directory
+ * @dentry: The name of the link
+ *
+ * Link the inode in "old_dentry" into the directory "dir" with the
+ * name in "dentry".
+ *
+ * Returns: errno
+ */
+
+static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
+		     struct dentry *dentry)
+{
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct inode *inode = old_dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder ghs[2];
+	int alloc_required;
+	int error;
+
+	if (S_ISDIR(ip->i_di.di_mode))
+		return -EPERM;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+	error = gfs2_glock_nq_m(2, ghs);
+	if (error)
+		goto out;
+
+	error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_dir_search(dir, &dentry->d_name, NULL, NULL);
+	switch (error) {
+	case -ENOENT:
+		break;
+	case 0:
+		error = -EEXIST;
+	default:
+		goto out_gunlock;
+	}
+
+	error = -EINVAL;
+	if (!dip->i_di.di_nlink)
+		goto out_gunlock;
+	error = -EFBIG;
+	if (dip->i_di.di_entries == (u32)-1)
+		goto out_gunlock;
+	error = -EPERM;
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		goto out_gunlock;
+	error = -EINVAL;
+	if (!ip->i_di.di_nlink)
+		goto out_gunlock;
+	error = -EMLINK;
+	if (ip->i_di.di_nlink == (u32)-1)
+		goto out_gunlock;
+
+	alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
+	if (error < 0)
+		goto out_gunlock;
+	error = 0;
+
+	if (alloc_required) {
+		struct gfs2_alloc *al = gfs2_alloc_get(dip);
+
+		error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+		if (error)
+			goto out_alloc;
+
+		error = gfs2_quota_check(dip, dip->i_di.di_uid,
+					 dip->i_di.di_gid);
+		if (error)
+			goto out_gunlock_q;
+
+		al->al_requested = sdp->sd_max_dirres;
+
+		error = gfs2_inplace_reserve(dip);
+		if (error)
+			goto out_gunlock_q;
+
+		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+					 al->al_rgd->rd_ri.ri_length +
+					 2 * RES_DINODE + RES_STATFS +
+					 RES_QUOTA, 0);
+		if (error)
+			goto out_ipres;
+	} else {
+		error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
+		if (error)
+			goto out_ipres;
+	}
+
+	error = gfs2_dir_add(dir, &dentry->d_name, &ip->i_num,
+			     IF2DT(ip->i_di.di_mode));
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_change_nlink(ip, +1);
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipres:
+	if (alloc_required)
+		gfs2_inplace_release(dip);
+out_gunlock_q:
+	if (alloc_required)
+		gfs2_quota_unlock(dip);
+out_alloc:
+	if (alloc_required)
+		gfs2_alloc_put(dip);
+out_gunlock:
+	gfs2_glock_dq_m(2, ghs);
+out:
+	gfs2_holder_uninit(ghs);
+	gfs2_holder_uninit(ghs + 1);
+	if (!error) {
+		atomic_inc(&inode->i_count);
+		d_instantiate(dentry, inode);
+		mark_inode_dirty(inode);
+	}
+	return error;
+}
+
+/**
+ * gfs2_unlink - Unlink a file
+ * @dir: The inode of the directory containing the file to unlink
+ * @dentry: The file itself
+ *
+ * Unlink a file.  Call gfs2_unlinki()
+ *
+ * Returns: errno
+ */
+
+static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_holder ghs[2];
+	int error;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+	error = gfs2_glock_nq_m(2, ghs);
+	if (error)
+		goto out;
+
+	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_dir_del(dip, &dentry->d_name);
+        if (error)
+                goto out_end_trans;
+
+	error = gfs2_change_nlink(ip, -1);
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_gunlock:
+	gfs2_glock_dq_m(2, ghs);
+out:
+	gfs2_holder_uninit(ghs);
+	gfs2_holder_uninit(ghs + 1);
+	return error;
+}
+
+/**
+ * gfs2_symlink - Create a symlink
+ * @dir: The directory to create the symlink in
+ * @dentry: The dentry to put the symlink in
+ * @symname: The thing which the link points to
+ *
+ * Returns: errno
+ */
+
+static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
+			const char *symname)
+{
+	struct gfs2_inode *dip = GFS2_I(dir), *ip;
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct gfs2_holder ghs[2];
+	struct inode *inode;
+	struct buffer_head *dibh;
+	int size;
+	int error;
+
+	/* Must be stuffed with a null terminator for gfs2_follow_link() */
+	size = strlen(symname);
+	if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
+		return -ENAMETOOLONG;
+
+	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+
+	inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO);
+	if (IS_ERR(inode)) {
+		gfs2_holder_uninit(ghs);
+		return PTR_ERR(inode);
+	}
+
+	ip = ghs[1].gh_gl->gl_object;
+
+	ip->i_di.di_size = size;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+
+	if (!gfs2_assert_withdraw(sdp, !error)) {
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
+		       size);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+	if (dip->i_alloc.al_rgd)
+		gfs2_inplace_release(dip);
+	gfs2_quota_unlock(dip);
+	gfs2_alloc_put(dip);
+
+	gfs2_glock_dq_uninit_m(2, ghs);
+
+	d_instantiate(dentry, inode);
+	mark_inode_dirty(inode);
+
+	return 0;
+}
+
+/**
+ * gfs2_mkdir - Make a directory
+ * @dir: The parent directory of the new one
+ * @dentry: The dentry of the new directory
+ * @mode: The mode of the new directory
+ *
+ * Returns: errno
+ */
+
+static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct gfs2_inode *dip = GFS2_I(dir), *ip;
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct gfs2_holder ghs[2];
+	struct inode *inode;
+	struct buffer_head *dibh;
+	int error;
+
+	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+
+	inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode);
+	if (IS_ERR(inode)) {
+		gfs2_holder_uninit(ghs);
+		return PTR_ERR(inode);
+	}
+
+	ip = ghs[1].gh_gl->gl_object;
+
+	ip->i_di.di_nlink = 2;
+	ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+	ip->i_di.di_flags |= GFS2_DIF_JDATA;
+	ip->i_di.di_payload_format = GFS2_FORMAT_DE;
+	ip->i_di.di_entries = 2;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+
+	if (!gfs2_assert_withdraw(sdp, !error)) {
+		struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+		struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
+		struct qstr str;
+
+		gfs2_str2qstr(&str, ".");
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent);
+		dent->de_inum = di->di_num; /* already GFS2 endian */
+		dent->de_type = cpu_to_be16(DT_DIR);
+		di->di_entries = cpu_to_be32(1);
+
+		gfs2_str2qstr(&str, "..");
+		dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
+		gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+
+		gfs2_inum_out(&dip->i_num, &dent->de_inum);
+		dent->de_type = cpu_to_be16(DT_DIR);
+
+		gfs2_dinode_out(&ip->i_di, di);
+
+		brelse(dibh);
+	}
+
+	error = gfs2_change_nlink(dip, +1);
+	gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
+
+	gfs2_trans_end(sdp);
+	if (dip->i_alloc.al_rgd)
+		gfs2_inplace_release(dip);
+	gfs2_quota_unlock(dip);
+	gfs2_alloc_put(dip);
+
+	gfs2_glock_dq_uninit_m(2, ghs);
+
+	d_instantiate(dentry, inode);
+	mark_inode_dirty(inode);
+
+	return 0;
+}
+
+/**
+ * gfs2_rmdir - Remove a directory
+ * @dir: The parent directory of the directory to be removed
+ * @dentry: The dentry of the directory to remove
+ *
+ * Remove a directory. Call gfs2_rmdiri()
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_holder ghs[2];
+	int error;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+	error = gfs2_glock_nq_m(2, ghs);
+	if (error)
+		goto out;
+
+	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
+	if (error)
+		goto out_gunlock;
+
+	if (ip->i_di.di_entries < 2) {
+		if (gfs2_consist_inode(ip))
+			gfs2_dinode_print(&ip->i_di);
+		error = -EIO;
+		goto out_gunlock;
+	}
+	if (ip->i_di.di_entries > 2) {
+		error = -ENOTEMPTY;
+		goto out_gunlock;
+	}
+
+	error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_rmdiri(dip, &dentry->d_name, ip);
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_m(2, ghs);
+out:
+	gfs2_holder_uninit(ghs);
+	gfs2_holder_uninit(ghs + 1);
+	return error;
+}
+
+/**
+ * gfs2_mknod - Make a special file
+ * @dir: The directory in which the special file will reside
+ * @dentry: The dentry of the special file
+ * @mode: The mode of the special file
+ * @rdev: The device specification of the special file
+ *
+ */
+
+static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		      dev_t dev)
+{
+	struct gfs2_inode *dip = GFS2_I(dir), *ip;
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct gfs2_holder ghs[2];
+	struct inode *inode;
+	struct buffer_head *dibh;
+	u32 major = 0, minor = 0;
+	int error;
+
+	switch (mode & S_IFMT) {
+	case S_IFBLK:
+	case S_IFCHR:
+		major = MAJOR(dev);
+		minor = MINOR(dev);
+		break;
+	case S_IFIFO:
+	case S_IFSOCK:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	};
+
+	gfs2_holder_init(dip->i_gl, 0, 0, ghs);
+
+	inode = gfs2_createi(ghs, &dentry->d_name, mode);
+	if (IS_ERR(inode)) {
+		gfs2_holder_uninit(ghs);
+		return PTR_ERR(inode);
+	}
+
+	ip = ghs[1].gh_gl->gl_object;
+
+	ip->i_di.di_major = major;
+	ip->i_di.di_minor = minor;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+
+	if (!gfs2_assert_withdraw(sdp, !error)) {
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+	if (dip->i_alloc.al_rgd)
+		gfs2_inplace_release(dip);
+	gfs2_quota_unlock(dip);
+	gfs2_alloc_put(dip);
+
+	gfs2_glock_dq_uninit_m(2, ghs);
+
+	d_instantiate(dentry, inode);
+	mark_inode_dirty(inode);
+
+	return 0;
+}
+
+/**
+ * gfs2_rename - Rename a file
+ * @odir: Parent directory of old file name
+ * @odentry: The old dentry of the file
+ * @ndir: Parent directory of new file name
+ * @ndentry: The new dentry of the file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rename(struct inode *odir, struct dentry *odentry,
+		       struct inode *ndir, struct dentry *ndentry)
+{
+	struct gfs2_inode *odip = GFS2_I(odir);
+	struct gfs2_inode *ndip = GFS2_I(ndir);
+	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
+	struct gfs2_inode *nip = NULL;
+	struct gfs2_sbd *sdp = GFS2_SB(odir);
+	struct gfs2_holder ghs[4], r_gh;
+	unsigned int num_gh;
+	int dir_rename = 0;
+	int alloc_required;
+	unsigned int x;
+	int error;
+
+	if (ndentry->d_inode) {
+		nip = GFS2_I(ndentry->d_inode);
+		if (ip == nip)
+			return 0;
+	}
+
+	/* Make sure we aren't trying to move a dirctory into it's subdir */
+
+	if (S_ISDIR(ip->i_di.di_mode) && odip != ndip) {
+		dir_rename = 1;
+
+		error = gfs2_glock_nq_init(sdp->sd_rename_gl,
+					   LM_ST_EXCLUSIVE, 0,
+					   &r_gh);
+		if (error)
+			goto out;
+
+		error = gfs2_ok_to_move(ip, ndip);
+		if (error)
+			goto out_gunlock_r;
+	}
+
+	num_gh = 1;
+	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	if (odip != ndip) {
+		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		num_gh++;
+	}
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+	num_gh++;
+
+	if (nip) {
+		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		num_gh++;
+	}
+
+	error = gfs2_glock_nq_m(num_gh, ghs);
+	if (error)
+		goto out_uninit;
+
+	/* Check out the old directory */
+
+	error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
+	if (error)
+		goto out_gunlock;
+
+	/* Check out the new directory */
+
+	if (nip) {
+		error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
+		if (error)
+			goto out_gunlock;
+
+		if (S_ISDIR(nip->i_di.di_mode)) {
+			if (nip->i_di.di_entries < 2) {
+				if (gfs2_consist_inode(nip))
+					gfs2_dinode_print(&nip->i_di);
+				error = -EIO;
+				goto out_gunlock;
+			}
+			if (nip->i_di.di_entries > 2) {
+				error = -ENOTEMPTY;
+				goto out_gunlock;
+			}
+		}
+	} else {
+		error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
+		if (error)
+			goto out_gunlock;
+
+		error = gfs2_dir_search(ndir, &ndentry->d_name, NULL, NULL);
+		switch (error) {
+		case -ENOENT:
+			error = 0;
+			break;
+		case 0:
+			error = -EEXIST;
+		default:
+			goto out_gunlock;
+		};
+
+		if (odip != ndip) {
+			if (!ndip->i_di.di_nlink) {
+				error = -EINVAL;
+				goto out_gunlock;
+			}
+			if (ndip->i_di.di_entries == (u32)-1) {
+				error = -EFBIG;
+				goto out_gunlock;
+			}
+			if (S_ISDIR(ip->i_di.di_mode) &&
+			    ndip->i_di.di_nlink == (u32)-1) {
+				error = -EMLINK;
+				goto out_gunlock;
+			}
+		}
+	}
+
+	/* Check out the dir to be renamed */
+
+	if (dir_rename) {
+		error = permission(odentry->d_inode, MAY_WRITE, NULL);
+		if (error)
+			goto out_gunlock;
+	}
+
+	alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
+	if (error < 0)
+		goto out_gunlock;
+	error = 0;
+
+	if (alloc_required) {
+		struct gfs2_alloc *al = gfs2_alloc_get(ndip);
+
+		error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+		if (error)
+			goto out_alloc;
+
+		error = gfs2_quota_check(ndip, ndip->i_di.di_uid,
+					 ndip->i_di.di_gid);
+		if (error)
+			goto out_gunlock_q;
+
+		al->al_requested = sdp->sd_max_dirres;
+
+		error = gfs2_inplace_reserve(ndip);
+		if (error)
+			goto out_gunlock_q;
+
+		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+					 al->al_rgd->rd_ri.ri_length +
+					 4 * RES_DINODE + 4 * RES_LEAF +
+					 RES_STATFS + RES_QUOTA, 0);
+		if (error)
+			goto out_ipreserv;
+	} else {
+		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
+					 5 * RES_LEAF, 0);
+		if (error)
+			goto out_gunlock;
+	}
+
+	/* Remove the target file, if it exists */
+
+	if (nip) {
+		if (S_ISDIR(nip->i_di.di_mode))
+			error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
+		else {
+			error = gfs2_dir_del(ndip, &ndentry->d_name);
+			if (error)
+				goto out_end_trans;
+			error = gfs2_change_nlink(nip, -1);
+		}
+		if (error)
+			goto out_end_trans;
+	}
+
+	if (dir_rename) {
+		struct qstr name;
+		gfs2_str2qstr(&name, "..");
+
+		error = gfs2_change_nlink(ndip, +1);
+		if (error)
+			goto out_end_trans;
+		error = gfs2_change_nlink(odip, -1);
+		if (error)
+			goto out_end_trans;
+
+		error = gfs2_dir_mvino(ip, &name, &ndip->i_num, DT_DIR);
+		if (error)
+			goto out_end_trans;
+	} else {
+		struct buffer_head *dibh;
+		error = gfs2_meta_inode_buffer(ip, &dibh);
+		if (error)
+			goto out_end_trans;
+		ip->i_di.di_ctime = get_seconds();
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(&ip->i_di, dibh->b_data);
+		brelse(dibh);
+	}
+
+	error = gfs2_dir_del(odip, &odentry->d_name);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_dir_add(ndir, &ndentry->d_name, &ip->i_num,
+			     IF2DT(ip->i_di.di_mode));
+	if (error)
+		goto out_end_trans;
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipreserv:
+	if (alloc_required)
+		gfs2_inplace_release(ndip);
+out_gunlock_q:
+	if (alloc_required)
+		gfs2_quota_unlock(ndip);
+out_alloc:
+	if (alloc_required)
+		gfs2_alloc_put(ndip);
+out_gunlock:
+	gfs2_glock_dq_m(num_gh, ghs);
+out_uninit:
+	for (x = 0; x < num_gh; x++)
+		gfs2_holder_uninit(ghs + x);
+out_gunlock_r:
+	if (dir_rename)
+		gfs2_glock_dq_uninit(&r_gh);
+out:
+	return error;
+}
+
+/**
+ * gfs2_readlink - Read the value of a symlink
+ * @dentry: the symlink
+ * @buf: the buffer to read the symlink data into
+ * @size: the size of the buffer
+ *
+ * Returns: errno
+ */
+
+static int gfs2_readlink(struct dentry *dentry, char __user *user_buf,
+			 int user_size)
+{
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	char array[GFS2_FAST_NAME_SIZE], *buf = array;
+	unsigned int len = GFS2_FAST_NAME_SIZE;
+	int error;
+
+	error = gfs2_readlinki(ip, &buf, &len);
+	if (error)
+		return error;
+
+	if (user_size > len - 1)
+		user_size = len - 1;
+
+	if (copy_to_user(user_buf, buf, user_size))
+		error = -EFAULT;
+	else
+		error = user_size;
+
+	if (buf != array)
+		kfree(buf);
+
+	return error;
+}
+
+/**
+ * gfs2_follow_link - Follow a symbolic link
+ * @dentry: The dentry of the link
+ * @nd: Data that we pass to vfs_follow_link()
+ *
+ * This can handle symlinks of any size. It is optimised for symlinks
+ * under GFS2_FAST_NAME_SIZE.
+ *
+ * Returns: 0 on success or error code
+ */
+
+static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	char array[GFS2_FAST_NAME_SIZE], *buf = array;
+	unsigned int len = GFS2_FAST_NAME_SIZE;
+	int error;
+
+	error = gfs2_readlinki(ip, &buf, &len);
+	if (!error) {
+		error = vfs_follow_link(nd, buf);
+		if (buf != array)
+			kfree(buf);
+	}
+
+	return ERR_PTR(error);
+}
+
+/**
+ * gfs2_permission -
+ * @inode:
+ * @mask:
+ * @nd: passed from Linux VFS, ignored by us
+ *
+ * Returns: errno
+ */
+
+static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	int error;
+
+	if (ip->i_vn == ip->i_gl->gl_vn)
+		return generic_permission(inode, mask, gfs2_check_acl);
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (!error) {
+		error = generic_permission(inode, mask, gfs2_check_acl_locked);
+		gfs2_glock_dq_uninit(&i_gh);
+	}
+
+	return error;
+}
+
+static int setattr_size(struct inode *inode, struct iattr *attr)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	int error;
+
+	if (attr->ia_size != ip->i_di.di_size) {
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	error = gfs2_truncatei(ip, attr->ia_size);
+	if (error)
+		return error;
+
+	return error;
+}
+
+static int setattr_chown(struct inode *inode, struct iattr *attr)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct buffer_head *dibh;
+	u32 ouid, ogid, nuid, ngid;
+	int error;
+
+	ouid = ip->i_di.di_uid;
+	ogid = ip->i_di.di_gid;
+	nuid = attr->ia_uid;
+	ngid = attr->ia_gid;
+
+	if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
+		ouid = nuid = NO_QUOTA_CHANGE;
+	if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
+		ogid = ngid = NO_QUOTA_CHANGE;
+
+	gfs2_alloc_get(ip);
+
+	error = gfs2_quota_lock(ip, nuid, ngid);
+	if (error)
+		goto out_alloc;
+
+	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+		error = gfs2_quota_check(ip, nuid, ngid);
+		if (error)
+			goto out_gunlock_q;
+	}
+
+	error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out_end_trans;
+
+	error = inode_setattr(inode, attr);
+	gfs2_assert_warn(sdp, !error);
+	gfs2_inode_attr_out(ip);
+
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(&ip->i_di, dibh->b_data);
+	brelse(dibh);
+
+	if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
+		gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid);
+		gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid);
+	}
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+/**
+ * gfs2_setattr - Change attributes on an inode
+ * @dentry: The dentry which is changing
+ * @attr: The structure describing the change
+ *
+ * The VFS layer wants to change one or more of an inodes attributes.  Write
+ * that change out to disk.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	int error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		return error;
+
+	error = -EPERM;
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		goto out;
+
+	error = inode_change_ok(inode, attr);
+	if (error)
+		goto out;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		error = setattr_size(inode, attr);
+	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
+		error = setattr_chown(inode, attr);
+	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
+		error = gfs2_acl_chmod(ip, attr);
+	else
+		error = gfs2_setattr_simple(ip, attr);
+
+out:
+	gfs2_glock_dq_uninit(&i_gh);
+	if (!error)
+		mark_inode_dirty(inode);
+	return error;
+}
+
+/**
+ * gfs2_getattr - Read out an inode's attributes
+ * @mnt: The vfsmount the inode is being accessed from
+ * @dentry: The dentry to stat
+ * @stat: The inode's stats
+ *
+ * Returns: errno
+ */
+
+static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	if (!error) {
+		generic_fillattr(inode, stat);
+		gfs2_glock_dq_uninit(&gh);
+	}
+
+	return error;
+}
+
+static int gfs2_setxattr(struct dentry *dentry, const char *name,
+			 const void *data, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_ea_request er;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	er.er_type = gfs2_ea_name2type(name, &er.er_name);
+	if (er.er_type == GFS2_EATYPE_UNUSED)
+		return -EOPNOTSUPP;
+	er.er_data = (char *)data;
+	er.er_name_len = strlen(er.er_name);
+	er.er_data_len = size;
+	er.er_flags = flags;
+
+	gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
+
+	return gfs2_ea_set(GFS2_I(inode), &er);
+}
+
+static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
+			     void *data, size_t size)
+{
+	struct gfs2_ea_request er;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	er.er_type = gfs2_ea_name2type(name, &er.er_name);
+	if (er.er_type == GFS2_EATYPE_UNUSED)
+		return -EOPNOTSUPP;
+	er.er_data = data;
+	er.er_name_len = strlen(er.er_name);
+	er.er_data_len = size;
+
+	return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
+}
+
+static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct gfs2_ea_request er;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	er.er_data = (size) ? buffer : NULL;
+	er.er_data_len = size;
+
+	return gfs2_ea_list(GFS2_I(dentry->d_inode), &er);
+}
+
+static int gfs2_removexattr(struct dentry *dentry, const char *name)
+{
+	struct gfs2_ea_request er;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	er.er_type = gfs2_ea_name2type(name, &er.er_name);
+	if (er.er_type == GFS2_EATYPE_UNUSED)
+		return -EOPNOTSUPP;
+	er.er_name_len = strlen(er.er_name);
+
+	return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
+}
+
+struct inode_operations gfs2_file_iops = {
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+};
+
+struct inode_operations gfs2_dev_iops = {
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+};
+
+struct inode_operations gfs2_dir_iops = {
+	.create = gfs2_create,
+	.lookup = gfs2_lookup,
+	.link = gfs2_link,
+	.unlink = gfs2_unlink,
+	.symlink = gfs2_symlink,
+	.mkdir = gfs2_mkdir,
+	.rmdir = gfs2_rmdir,
+	.mknod = gfs2_mknod,
+	.rename = gfs2_rename,
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+};
+
+struct inode_operations gfs2_symlink_iops = {
+	.readlink = gfs2_readlink,
+	.follow_link = gfs2_follow_link,
+	.permission = gfs2_permission,
+	.setattr = gfs2_setattr,
+	.getattr = gfs2_getattr,
+	.setxattr = gfs2_setxattr,
+	.getxattr = gfs2_getxattr,
+	.listxattr = gfs2_listxattr,
+	.removexattr = gfs2_removexattr,
+};
+
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
new file mode 100644
index 000000000000..b15acb4fd34c
--- /dev/null
+++ b/fs/gfs2/ops_inode.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_INODE_DOT_H__
+#define __OPS_INODE_DOT_H__
+
+#include <linux/fs.h>
+
+extern struct inode_operations gfs2_file_iops;
+extern struct inode_operations gfs2_dir_iops;
+extern struct inode_operations gfs2_symlink_iops;
+extern struct inode_operations gfs2_dev_iops;
+
+#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
new file mode 100644
index 000000000000..06f06f7773d0
--- /dev/null
+++ b/fs/gfs2/ops_super.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "inode.h"
+#include "lm.h"
+#include "log.h"
+#include "mount.h"
+#include "ops_super.h"
+#include "quota.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "sys.h"
+#include "util.h"
+#include "trans.h"
+#include "dir.h"
+#include "eattr.h"
+#include "bmap.h"
+
+/**
+ * gfs2_write_inode - Make sure the inode is stable on the disk
+ * @inode: The inode
+ * @sync: synchronous write flag
+ *
+ * Returns: errno
+ */
+
+static int gfs2_write_inode(struct inode *inode, int sync)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	/* Check this is a "normal" inode */
+	if (inode->i_private) {
+		if (current->flags & PF_MEMALLOC)
+			return 0;
+		if (sync)
+			gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_put_super - Unmount the filesystem
+ * @sb: The VFS superblock
+ *
+ */
+
+static void gfs2_put_super(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
+
+	if (!sdp)
+		return;
+
+	if (!strncmp(sb->s_type->name, "gfs2meta", 8))
+		return; /* Nothing to do */
+
+	/*  Unfreeze the filesystem, if we need to  */
+
+	mutex_lock(&sdp->sd_freeze_lock);
+	if (sdp->sd_freeze_count)
+		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+	mutex_unlock(&sdp->sd_freeze_lock);
+
+	kthread_stop(sdp->sd_quotad_process);
+	kthread_stop(sdp->sd_logd_process);
+	kthread_stop(sdp->sd_recoverd_process);
+	while (sdp->sd_glockd_num--)
+		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
+	kthread_stop(sdp->sd_scand_process);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		error = gfs2_make_fs_ro(sdp);
+		if (error)
+			gfs2_io_error(sdp);
+	}
+	/*  At this point, we're through modifying the disk  */
+
+	/*  Release stuff  */
+
+	iput(sdp->sd_master_dir);
+	iput(sdp->sd_jindex);
+	iput(sdp->sd_inum_inode);
+	iput(sdp->sd_statfs_inode);
+	iput(sdp->sd_rindex);
+	iput(sdp->sd_quota_inode);
+
+	gfs2_glock_put(sdp->sd_rename_gl);
+	gfs2_glock_put(sdp->sd_trans_gl);
+
+	if (!sdp->sd_args.ar_spectator) {
+		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+		iput(sdp->sd_ir_inode);
+		iput(sdp->sd_sc_inode);
+		iput(sdp->sd_qc_inode);
+	}
+
+	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+	gfs2_clear_rgrpd(sdp);
+	gfs2_jindex_free(sdp);
+	/*  Take apart glock structures and buffer lists  */
+	gfs2_gl_hash_clear(sdp, WAIT);
+	/*  Unmount the locking protocol  */
+	gfs2_lm_unmount(sdp);
+
+	/*  At this point, we're through participating in the lockspace  */
+	gfs2_sys_fs_del(sdp);
+	kfree(sdp);
+}
+
+/**
+ * gfs2_write_super - disk commit all incore transactions
+ * @sb: the filesystem
+ *
+ * This function is called every time sync(2) is called.
+ * After this exits, all dirty buffers are synced.
+ */
+
+static void gfs2_write_super(struct super_block *sb)
+{
+	gfs2_log_flush(sb->s_fs_info, NULL);
+}
+
+/**
+ * gfs2_write_super_lockfs - prevent further writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static void gfs2_write_super_lockfs(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
+
+	for (;;) {
+		error = gfs2_freeze_fs(sdp);
+		if (!error)
+			break;
+
+		switch (error) {
+		case -EBUSY:
+			fs_err(sdp, "waiting for recovery before freeze\n");
+			break;
+
+		default:
+			fs_err(sdp, "error freezing FS: %d\n", error);
+			break;
+		}
+
+		fs_err(sdp, "retrying...\n");
+		msleep(1000);
+	}
+}
+
+/**
+ * gfs2_unlockfs - reallow writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static void gfs2_unlockfs(struct super_block *sb)
+{
+	gfs2_unfreeze_fs(sb->s_fs_info);
+}
+
+/**
+ * gfs2_statfs - Gather and return stats about the filesystem
+ * @sb: The superblock
+ * @statfsbuf: The buffer
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_statfs_change sc;
+	int error;
+
+	if (gfs2_tune_get(sdp, gt_statfs_slow))
+		error = gfs2_statfs_slow(sdp, &sc);
+	else
+		error = gfs2_statfs_i(sdp, &sc);
+
+	if (error)
+		return error;
+
+	buf->f_type = GFS2_MAGIC;
+	buf->f_bsize = sdp->sd_sb.sb_bsize;
+	buf->f_blocks = sc.sc_total;
+	buf->f_bfree = sc.sc_free;
+	buf->f_bavail = sc.sc_free;
+	buf->f_files = sc.sc_dinodes + sc.sc_free;
+	buf->f_ffree = sc.sc_free;
+	buf->f_namelen = GFS2_FNAMESIZE;
+
+	return 0;
+}
+
+/**
+ * gfs2_remount_fs - called when the FS is remounted
+ * @sb:  the filesystem
+ * @flags:  the remount flags
+ * @data:  extra data passed in (not used right now)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
+
+	error = gfs2_mount_args(sdp, data, 1);
+	if (error)
+		return error;
+
+	if (sdp->sd_args.ar_spectator)
+		*flags |= MS_RDONLY;
+	else {
+		if (*flags & MS_RDONLY) {
+			if (!(sb->s_flags & MS_RDONLY))
+				error = gfs2_make_fs_ro(sdp);
+		} else if (!(*flags & MS_RDONLY) &&
+			   (sb->s_flags & MS_RDONLY)) {
+			error = gfs2_make_fs_rw(sdp);
+		}
+	}
+
+	if (*flags & (MS_NOATIME | MS_NODIRATIME))
+		set_bit(SDF_NOATIME, &sdp->sd_flags);
+	else
+		clear_bit(SDF_NOATIME, &sdp->sd_flags);
+
+	/* Don't let the VFS update atimes.  GFS2 handles this itself. */
+	*flags |= MS_NOATIME | MS_NODIRATIME;
+
+	return error;
+}
+
+/**
+ * gfs2_clear_inode - Deallocate an inode when VFS is done with it
+ * @inode: The VFS inode
+ *
+ */
+
+static void gfs2_clear_inode(struct inode *inode)
+{
+	/* This tells us its a "real" inode and not one which only
+	 * serves to contain an address space (see rgrp.c, meta_io.c)
+	 * which therefore doesn't have its own glocks.
+	 */
+	if (inode->i_private) {
+		struct gfs2_inode *ip = GFS2_I(inode);
+		gfs2_glock_inode_squish(inode);
+		gfs2_assert(inode->i_sb->s_fs_info, ip->i_gl->gl_state == LM_ST_UNLOCKED);
+		ip->i_gl->gl_object = NULL;
+		gfs2_glock_schedule_for_reclaim(ip->i_gl);
+		gfs2_glock_put(ip->i_gl);
+		ip->i_gl = NULL;
+		if (ip->i_iopen_gh.gh_gl)
+			gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+	}
+}
+
+/**
+ * gfs2_show_options - Show mount options for /proc/mounts
+ * @s: seq_file structure
+ * @mnt: vfsmount
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
+	struct gfs2_args *args = &sdp->sd_args;
+
+	if (args->ar_lockproto[0])
+		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+	if (args->ar_locktable[0])
+		seq_printf(s, ",locktable=%s", args->ar_locktable);
+	if (args->ar_hostdata[0])
+		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+	if (args->ar_spectator)
+		seq_printf(s, ",spectator");
+	if (args->ar_ignore_local_fs)
+		seq_printf(s, ",ignore_local_fs");
+	if (args->ar_localflocks)
+		seq_printf(s, ",localflocks");
+	if (args->ar_localcaching)
+		seq_printf(s, ",localcaching");
+	if (args->ar_debug)
+		seq_printf(s, ",debug");
+	if (args->ar_upgrade)
+		seq_printf(s, ",upgrade");
+	if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
+		seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
+	if (args->ar_posix_acl)
+		seq_printf(s, ",acl");
+	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
+		char *state;
+		switch (args->ar_quota) {
+		case GFS2_QUOTA_OFF:
+			state = "off";
+			break;
+		case GFS2_QUOTA_ACCOUNT:
+			state = "account";
+			break;
+		case GFS2_QUOTA_ON:
+			state = "on";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",quota=%s", state);
+	}
+	if (args->ar_suiddir)
+		seq_printf(s, ",suiddir");
+	if (args->ar_data != GFS2_DATA_DEFAULT) {
+		char *state;
+		switch (args->ar_data) {
+		case GFS2_DATA_WRITEBACK:
+			state = "writeback";
+			break;
+		case GFS2_DATA_ORDERED:
+			state = "ordered";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",data=%s", state);
+	}
+
+	return 0;
+}
+
+/*
+ * We have to (at the moment) hold the inodes main lock to cover
+ * the gap between unlocking the shared lock on the iopen lock and
+ * taking the exclusive lock. I'd rather do a shared -> exclusive
+ * conversion on the iopen lock, but we can change that later. This
+ * is safe, just less efficient.
+ */
+static void gfs2_delete_inode(struct inode *inode)
+{
+	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+
+	if (!inode->i_private)
+		goto out;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &gh);
+	if (unlikely(error)) {
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+		goto out;
+	}
+
+	gfs2_glock_dq(&ip->i_iopen_gh);
+	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
+	error = gfs2_glock_nq(&ip->i_iopen_gh);
+	if (error)
+		goto out_uninit;
+
+	if (S_ISDIR(ip->i_di.di_mode) &&
+	    (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+		error = gfs2_dir_exhash_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (ip->i_di.di_eattr) {
+		error = gfs2_ea_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (!gfs2_is_stuffed(ip)) {
+		error = gfs2_file_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	error = gfs2_dinode_dealloc(ip);
+
+out_unlock:
+	gfs2_glock_dq(&ip->i_iopen_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_iopen_gh);
+	gfs2_glock_dq_uninit(&gh);
+	if (error)
+		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
+out:
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+
+
+static struct inode *gfs2_alloc_inode(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_inode *ip;
+
+	ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+	if (ip) {
+		ip->i_flags = 0;
+		ip->i_gl = NULL;
+		ip->i_greedy = gfs2_tune_get(sdp, gt_greedy_default);
+		ip->i_last_pfault = jiffies;
+	}
+	return &ip->i_inode;
+}
+
+static void gfs2_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(gfs2_inode_cachep, inode);
+}
+
+struct super_operations gfs2_super_ops = {
+	.alloc_inode = gfs2_alloc_inode,
+	.destroy_inode = gfs2_destroy_inode,
+	.write_inode = gfs2_write_inode,
+	.delete_inode = gfs2_delete_inode,
+	.put_super = gfs2_put_super,
+	.write_super = gfs2_write_super,
+	.write_super_lockfs = gfs2_write_super_lockfs,
+	.unlockfs = gfs2_unlockfs,
+	.statfs = gfs2_statfs,
+	.remount_fs = gfs2_remount_fs,
+	.clear_inode = gfs2_clear_inode,
+	.show_options = gfs2_show_options,
+};
+
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
new file mode 100644
index 000000000000..9de73f042f78
--- /dev/null
+++ b/fs/gfs2/ops_super.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_SUPER_DOT_H__
+#define __OPS_SUPER_DOT_H__
+
+#include <linux/fs.h>
+
+extern struct super_operations gfs2_super_ops;
+
+#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
new file mode 100644
index 000000000000..5453d2947ab3
--- /dev/null
+++ b/fs/gfs2/ops_vm.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "inode.h"
+#include "ops_vm.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "trans.h"
+#include "util.h"
+
+static void pfault_be_greedy(struct gfs2_inode *ip)
+{
+	unsigned int time;
+
+	spin_lock(&ip->i_spin);
+	time = ip->i_greedy;
+	ip->i_last_pfault = jiffies;
+	spin_unlock(&ip->i_spin);
+
+	igrab(&ip->i_inode);
+	if (gfs2_glock_be_greedy(ip->i_gl, time))
+		iput(&ip->i_inode);
+}
+
+static struct page *gfs2_private_nopage(struct vm_area_struct *area,
+					unsigned long address, int *type)
+{
+	struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
+	struct page *result;
+
+	set_bit(GIF_PAGED, &ip->i_flags);
+
+	result = filemap_nopage(area, address, type);
+
+	if (result && result != NOPAGE_OOM)
+		pfault_be_greedy(ip);
+
+	return result;
+}
+
+static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned long index = page->index;
+	u64 lblock = index << (PAGE_CACHE_SHIFT -
+				    sdp->sd_sb.sb_bsize_shift);
+	unsigned int blocks = PAGE_CACHE_SIZE >> sdp->sd_sb.sb_bsize_shift;
+	struct gfs2_alloc *al;
+	unsigned int data_blocks, ind_blocks;
+	unsigned int x;
+	int error;
+
+	al = gfs2_alloc_get(ip);
+
+	error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+	if (error)
+		goto out;
+
+	error = gfs2_quota_check(ip, ip->i_di.di_uid, ip->i_di.di_gid);
+	if (error)
+		goto out_gunlock_q;
+
+	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+
+	al->al_requested = data_blocks + ind_blocks;
+
+	error = gfs2_inplace_reserve(ip);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_trans_begin(sdp, al->al_rgd->rd_ri.ri_length +
+				 ind_blocks + RES_DINODE +
+				 RES_STATFS + RES_QUOTA, 0);
+	if (error)
+		goto out_ipres;
+
+	if (gfs2_is_stuffed(ip)) {
+		error = gfs2_unstuff_dinode(ip, NULL);
+		if (error)
+			goto out_trans;
+	}
+
+	for (x = 0; x < blocks; ) {
+		u64 dblock;
+		unsigned int extlen;
+		int new = 1;
+
+		error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen);
+		if (error)
+			goto out_trans;
+
+		lblock += extlen;
+		x += extlen;
+	}
+
+	gfs2_assert_warn(sdp, al->al_alloced);
+
+out_trans:
+	gfs2_trans_end(sdp);
+out_ipres:
+	gfs2_inplace_release(ip);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+out:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
+					   unsigned long address, int *type)
+{
+	struct file *file = area->vm_file;
+	struct gfs2_file *gf = file->private_data;
+	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+	struct gfs2_holder i_gh;
+	struct page *result = NULL;
+	unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
+			      area->vm_pgoff;
+	int alloc_required;
+	int error;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		return NULL;
+
+	set_bit(GIF_PAGED, &ip->i_flags);
+	set_bit(GIF_SW_PAGED, &ip->i_flags);
+
+	error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT,
+					  PAGE_CACHE_SIZE, &alloc_required);
+	if (error)
+		goto out;
+
+	set_bit(GFF_EXLOCK, &gf->f_flags);
+	result = filemap_nopage(area, address, type);
+	clear_bit(GFF_EXLOCK, &gf->f_flags);
+	if (!result || result == NOPAGE_OOM)
+		goto out;
+
+	if (alloc_required) {
+		error = alloc_page_backing(ip, result);
+		if (error) {
+			page_cache_release(result);
+			result = NULL;
+			goto out;
+		}
+		set_page_dirty(result);
+	}
+
+	pfault_be_greedy(ip);
+out:
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return result;
+}
+
+struct vm_operations_struct gfs2_vm_ops_private = {
+	.nopage = gfs2_private_nopage,
+};
+
+struct vm_operations_struct gfs2_vm_ops_sharewrite = {
+	.nopage = gfs2_sharewrite_nopage,
+};
+
diff --git a/fs/gfs2/ops_vm.h b/fs/gfs2/ops_vm.h
new file mode 100644
index 000000000000..4ae8f43ed5e3
--- /dev/null
+++ b/fs/gfs2/ops_vm.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __OPS_VM_DOT_H__
+#define __OPS_VM_DOT_H__
+
+#include <linux/mm.h>
+
+extern struct vm_operations_struct gfs2_vm_ops_private;
+extern struct vm_operations_struct gfs2_vm_ops_sharewrite;
+
+#endif /* __OPS_VM_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
new file mode 100644
index 000000000000..a3deae7416c9
--- /dev/null
+++ b/fs/gfs2/quota.c
@@ -0,0 +1,1228 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+/*
+ * Quota change tags are associated with each transaction that allocates or
+ * deallocates space.  Those changes are accumulated locally to each node (in a
+ * per-node file) and then are periodically synced to the quota file.  This
+ * avoids the bottleneck of constantly touching the quota file, but introduces
+ * fuzziness in the current usage value of IDs that are being used on different
+ * nodes in the cluster simultaneously.  So, it is possible for a user on
+ * multiple nodes to overrun their quota, but that overrun is controlable.
+ * Since quota tags are part of transactions, there is no need to a quota check
+ * program to be run on node crashes or anything like that.
+ *
+ * There are couple of knobs that let the administrator manage the quota
+ * fuzziness.  "quota_quantum" sets the maximum time a quota change can be
+ * sitting on one node before being synced to the quota file.  (The default is
+ * 60 seconds.)  Another knob, "quota_scale" controls how quickly the frequency
+ * of quota file syncs increases as the user moves closer to their limit.  The
+ * more frequent the syncs, the more accurate the quota enforcement, but that
+ * means that there is more contention between the nodes for the quota file.
+ * The default value is one.  This sets the maximum theoretical quota overrun
+ * (with infinite node with infinite bandwidth) to twice the user's limit.  (In
+ * practice, the maximum overrun you see should be much less.)  A "quota_scale"
+ * number greater than one makes quota syncs more frequent and reduces the
+ * maximum overrun.  Numbers less than one (but greater than zero) make quota
+ * syncs less frequent.
+ *
+ * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
+ * the quota file, so it is not being constantly read.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "inode.h"
+#include "ops_file.h"
+#include "ops_address.h"
+#include "util.h"
+
+#define QUOTA_USER 1
+#define QUOTA_GROUP 0
+
+static u64 qd2offset(struct gfs2_quota_data *qd)
+{
+	u64 offset;
+
+	offset = 2 * (u64)qd->qd_id + !test_bit(QDF_USER, &qd->qd_flags);
+	offset *= sizeof(struct gfs2_quota);
+
+	return offset;
+}
+
+static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
+		    struct gfs2_quota_data **qdp)
+{
+	struct gfs2_quota_data *qd;
+	int error;
+
+	qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL);
+	if (!qd)
+		return -ENOMEM;
+
+	qd->qd_count = 1;
+	qd->qd_id = id;
+	if (user)
+		set_bit(QDF_USER, &qd->qd_flags);
+	qd->qd_slot = -1;
+
+	error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
+			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
+	if (error)
+		goto fail;
+
+	error = gfs2_lvb_hold(qd->qd_gl);
+	gfs2_glock_put(qd->qd_gl);
+	if (error)
+		goto fail;
+
+	*qdp = qd;
+
+	return 0;
+
+fail:
+	kfree(qd);
+	return error;
+}
+
+static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+		  struct gfs2_quota_data **qdp)
+{
+	struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
+	int error, found;
+
+	*qdp = NULL;
+
+	for (;;) {
+		found = 0;
+		spin_lock(&sdp->sd_quota_spin);
+		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+			if (qd->qd_id == id &&
+			    !test_bit(QDF_USER, &qd->qd_flags) == !user) {
+				qd->qd_count++;
+				found = 1;
+				break;
+			}
+		}
+
+		if (!found)
+			qd = NULL;
+
+		if (!qd && new_qd) {
+			qd = new_qd;
+			list_add(&qd->qd_list, &sdp->sd_quota_list);
+			atomic_inc(&sdp->sd_quota_count);
+			new_qd = NULL;
+		}
+
+		spin_unlock(&sdp->sd_quota_spin);
+
+		if (qd || !create) {
+			if (new_qd) {
+				gfs2_lvb_unhold(new_qd->qd_gl);
+				kfree(new_qd);
+			}
+			*qdp = qd;
+			return 0;
+		}
+
+		error = qd_alloc(sdp, user, id, &new_qd);
+		if (error)
+			return error;
+	}
+}
+
+static void qd_hold(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	spin_lock(&sdp->sd_quota_spin);
+	gfs2_assert(sdp, qd->qd_count);
+	qd->qd_count++;
+	spin_unlock(&sdp->sd_quota_spin);
+}
+
+static void qd_put(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	spin_lock(&sdp->sd_quota_spin);
+	gfs2_assert(sdp, qd->qd_count);
+	if (!--qd->qd_count)
+		qd->qd_last_touched = jiffies;
+	spin_unlock(&sdp->sd_quota_spin);
+}
+
+static int slot_get(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	unsigned int c, o = 0, b;
+	unsigned char byte = 0;
+
+	spin_lock(&sdp->sd_quota_spin);
+
+	if (qd->qd_slot_count++) {
+		spin_unlock(&sdp->sd_quota_spin);
+		return 0;
+	}
+
+	for (c = 0; c < sdp->sd_quota_chunks; c++)
+		for (o = 0; o < PAGE_SIZE; o++) {
+			byte = sdp->sd_quota_bitmap[c][o];
+			if (byte != 0xFF)
+				goto found;
+		}
+
+	goto fail;
+
+found:
+	for (b = 0; b < 8; b++)
+		if (!(byte & (1 << b)))
+			break;
+	qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
+
+	if (qd->qd_slot >= sdp->sd_quota_slots)
+		goto fail;
+
+	sdp->sd_quota_bitmap[c][o] |= 1 << b;
+
+	spin_unlock(&sdp->sd_quota_spin);
+
+	return 0;
+
+fail:
+	qd->qd_slot_count--;
+	spin_unlock(&sdp->sd_quota_spin);
+	return -ENOSPC;
+}
+
+static void slot_hold(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	spin_lock(&sdp->sd_quota_spin);
+	gfs2_assert(sdp, qd->qd_slot_count);
+	qd->qd_slot_count++;
+	spin_unlock(&sdp->sd_quota_spin);
+}
+
+static void slot_put(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	spin_lock(&sdp->sd_quota_spin);
+	gfs2_assert(sdp, qd->qd_slot_count);
+	if (!--qd->qd_slot_count) {
+		gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
+		qd->qd_slot = -1;
+	}
+	spin_unlock(&sdp->sd_quota_spin);
+}
+
+static int bh_get(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+	unsigned int block, offset;
+	struct buffer_head *bh;
+	int error;
+	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+
+	mutex_lock(&sdp->sd_quota_mutex);
+
+	if (qd->qd_bh_count++) {
+		mutex_unlock(&sdp->sd_quota_mutex);
+		return 0;
+	}
+
+	block = qd->qd_slot / sdp->sd_qc_per_block;
+	offset = qd->qd_slot % sdp->sd_qc_per_block;;
+
+	bh_map.b_size = 1 << ip->i_inode.i_blkbits;
+	error = gfs2_block_map(&ip->i_inode, block, 0, &bh_map);
+	if (error)
+		goto fail;
+	error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
+	if (error)
+		goto fail;
+	error = -EIO;
+	if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC))
+		goto fail_brelse;
+
+	qd->qd_bh = bh;
+	qd->qd_bh_qc = (struct gfs2_quota_change *)
+		(bh->b_data + sizeof(struct gfs2_meta_header) +
+		 offset * sizeof(struct gfs2_quota_change));
+
+	mutex_lock(&sdp->sd_quota_mutex);
+
+	return 0;
+
+fail_brelse:
+	brelse(bh);
+fail:
+	qd->qd_bh_count--;
+	mutex_unlock(&sdp->sd_quota_mutex);
+	return error;
+}
+
+static void bh_put(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	mutex_lock(&sdp->sd_quota_mutex);
+	gfs2_assert(sdp, qd->qd_bh_count);
+	if (!--qd->qd_bh_count) {
+		brelse(qd->qd_bh);
+		qd->qd_bh = NULL;
+		qd->qd_bh_qc = NULL;
+	}
+	mutex_unlock(&sdp->sd_quota_mutex);
+}
+
+static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
+{
+	struct gfs2_quota_data *qd = NULL;
+	int error;
+	int found = 0;
+
+	*qdp = NULL;
+
+	if (sdp->sd_vfs->s_flags & MS_RDONLY)
+		return 0;
+
+	spin_lock(&sdp->sd_quota_spin);
+
+	list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+		if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+		    !test_bit(QDF_CHANGE, &qd->qd_flags) ||
+		    qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
+			continue;
+
+		list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+
+		set_bit(QDF_LOCKED, &qd->qd_flags);
+		gfs2_assert_warn(sdp, qd->qd_count);
+		qd->qd_count++;
+		qd->qd_change_sync = qd->qd_change;
+		gfs2_assert_warn(sdp, qd->qd_slot_count);
+		qd->qd_slot_count++;
+		found = 1;
+
+		break;
+	}
+
+	if (!found)
+		qd = NULL;
+
+	spin_unlock(&sdp->sd_quota_spin);
+
+	if (qd) {
+		gfs2_assert_warn(sdp, qd->qd_change_sync);
+		error = bh_get(qd);
+		if (error) {
+			clear_bit(QDF_LOCKED, &qd->qd_flags);
+			slot_put(qd);
+			qd_put(qd);
+			return error;
+		}
+	}
+
+	*qdp = qd;
+
+	return 0;
+}
+
+static int qd_trylock(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	if (sdp->sd_vfs->s_flags & MS_RDONLY)
+		return 0;
+
+	spin_lock(&sdp->sd_quota_spin);
+
+	if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+	    !test_bit(QDF_CHANGE, &qd->qd_flags)) {
+		spin_unlock(&sdp->sd_quota_spin);
+		return 0;
+	}
+
+	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+
+	set_bit(QDF_LOCKED, &qd->qd_flags);
+	gfs2_assert_warn(sdp, qd->qd_count);
+	qd->qd_count++;
+	qd->qd_change_sync = qd->qd_change;
+	gfs2_assert_warn(sdp, qd->qd_slot_count);
+	qd->qd_slot_count++;
+
+	spin_unlock(&sdp->sd_quota_spin);
+
+	gfs2_assert_warn(sdp, qd->qd_change_sync);
+	if (bh_get(qd)) {
+		clear_bit(QDF_LOCKED, &qd->qd_flags);
+		slot_put(qd);
+		qd_put(qd);
+		return 0;
+	}
+
+	return 1;
+}
+
+static void qd_unlock(struct gfs2_quota_data *qd)
+{
+	gfs2_assert_warn(qd->qd_gl->gl_sbd,
+			 test_bit(QDF_LOCKED, &qd->qd_flags));
+	clear_bit(QDF_LOCKED, &qd->qd_flags);
+	bh_put(qd);
+	slot_put(qd);
+	qd_put(qd);
+}
+
+static int qdsb_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
+		    struct gfs2_quota_data **qdp)
+{
+	int error;
+
+	error = qd_get(sdp, user, id, create, qdp);
+	if (error)
+		return error;
+
+	error = slot_get(*qdp);
+	if (error)
+		goto fail;
+
+	error = bh_get(*qdp);
+	if (error)
+		goto fail_slot;
+
+	return 0;
+
+fail_slot:
+	slot_put(*qdp);
+fail:
+	qd_put(*qdp);
+	return error;
+}
+
+static void qdsb_put(struct gfs2_quota_data *qd)
+{
+	bh_put(qd);
+	slot_put(qd);
+	qd_put(qd);
+}
+
+int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_quota_data **qd = al->al_qd;
+	int error;
+
+	if (gfs2_assert_warn(sdp, !al->al_qd_num) ||
+	    gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
+		return -EIO;
+
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return 0;
+
+	error = qdsb_get(sdp, QUOTA_USER, ip->i_di.di_uid, CREATE, qd);
+	if (error)
+		goto out;
+	al->al_qd_num++;
+	qd++;
+
+	error = qdsb_get(sdp, QUOTA_GROUP, ip->i_di.di_gid, CREATE, qd);
+	if (error)
+		goto out;
+	al->al_qd_num++;
+	qd++;
+
+	if (uid != NO_QUOTA_CHANGE && uid != ip->i_di.di_uid) {
+		error = qdsb_get(sdp, QUOTA_USER, uid, CREATE, qd);
+		if (error)
+			goto out;
+		al->al_qd_num++;
+		qd++;
+	}
+
+	if (gid != NO_QUOTA_CHANGE && gid != ip->i_di.di_gid) {
+		error = qdsb_get(sdp, QUOTA_GROUP, gid, CREATE, qd);
+		if (error)
+			goto out;
+		al->al_qd_num++;
+		qd++;
+	}
+
+out:
+	if (error)
+		gfs2_quota_unhold(ip);
+	return error;
+}
+
+void gfs2_quota_unhold(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = &ip->i_alloc;
+	unsigned int x;
+
+	gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		qdsb_put(al->al_qd[x]);
+		al->al_qd[x] = NULL;
+	}
+	al->al_qd_num = 0;
+}
+
+static int sort_qd(const void *a, const void *b)
+{
+	const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a;
+	const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b;
+
+	if (!test_bit(QDF_USER, &qd_a->qd_flags) !=
+	    !test_bit(QDF_USER, &qd_b->qd_flags)) {
+		if (test_bit(QDF_USER, &qd_a->qd_flags))
+			return -1;
+		else
+			return 1;
+	}
+	if (qd_a->qd_id < qd_b->qd_id)
+		return -1;
+	if (qd_a->qd_id > qd_b->qd_id)
+		return 1;
+
+	return 0;
+}
+
+static void do_qc(struct gfs2_quota_data *qd, s64 change)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+	struct gfs2_quota_change *qc = qd->qd_bh_qc;
+	s64 x;
+
+	mutex_lock(&sdp->sd_quota_mutex);
+	gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1);
+
+	if (!test_bit(QDF_CHANGE, &qd->qd_flags)) {
+		qc->qc_change = 0;
+		qc->qc_flags = 0;
+		if (test_bit(QDF_USER, &qd->qd_flags))
+			qc->qc_flags = cpu_to_be32(GFS2_QCF_USER);
+		qc->qc_id = cpu_to_be32(qd->qd_id);
+	}
+
+	x = qc->qc_change;
+	x = be64_to_cpu(x) + change;
+	qc->qc_change = cpu_to_be64(x);
+
+	spin_lock(&sdp->sd_quota_spin);
+	qd->qd_change = x;
+	spin_unlock(&sdp->sd_quota_spin);
+
+	if (!x) {
+		gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
+		clear_bit(QDF_CHANGE, &qd->qd_flags);
+		qc->qc_flags = 0;
+		qc->qc_id = 0;
+		slot_put(qd);
+		qd_put(qd);
+	} else if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) {
+		qd_hold(qd);
+		slot_hold(qd);
+	}
+
+	mutex_unlock(&sdp->sd_quota_mutex);
+}
+
+/**
+ * gfs2_adjust_quota
+ *
+ * This function was mostly borrowed from gfs2_block_truncate_page which was
+ * in turn mostly borrowed from ext3
+ */
+static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
+			     s64 change, struct gfs2_quota_data *qd)
+{
+	struct inode *inode = &ip->i_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long index = loc >> PAGE_CACHE_SHIFT;
+	unsigned offset = loc & (PAGE_CACHE_SHIFT - 1);
+	unsigned blocksize, iblock, pos;
+	struct buffer_head *bh;
+	struct page *page;
+	void *kaddr;
+	__be64 *ptr;
+	s64 value;
+	int err = -EIO;
+
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+
+	blocksize = inode->i_sb->s_blocksize;
+	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+
+	bh = page_buffers(page);
+	pos = blocksize;
+	while (offset >= pos) {
+		bh = bh->b_this_page;
+		iblock++;
+		pos += blocksize;
+	}
+
+	if (!buffer_mapped(bh)) {
+		gfs2_get_block(inode, iblock, bh, 1);
+		if (!buffer_mapped(bh))
+			goto unlock;
+	}
+
+	if (PageUptodate(page))
+		set_buffer_uptodate(bh);
+
+	if (!buffer_uptodate(bh)) {
+		ll_rw_block(READ_META, 1, &bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			goto unlock;
+	}
+
+	gfs2_trans_add_bh(ip->i_gl, bh, 0);
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	ptr = kaddr + offset;
+	value = (s64)be64_to_cpu(*ptr) + change;
+	*ptr = cpu_to_be64(value);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+	err = 0;
+	qd->qd_qb.qb_magic = cpu_to_be32(GFS2_MAGIC);
+	qd->qd_qb.qb_value = cpu_to_be64(value);
+unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return err;
+}
+
+static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
+{
+	struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	unsigned int data_blocks, ind_blocks;
+	struct gfs2_holder *ghs, i_gh;
+	unsigned int qx, x;
+	struct gfs2_quota_data *qd;
+	loff_t offset;
+	unsigned int nalloc = 0;
+	struct gfs2_alloc *al = NULL;
+	int error;
+
+	gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
+			      &data_blocks, &ind_blocks);
+
+	ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL);
+	if (!ghs)
+		return -ENOMEM;
+
+	sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+	for (qx = 0; qx < num_qd; qx++) {
+		error = gfs2_glock_nq_init(qda[qx]->qd_gl,
+					   LM_ST_EXCLUSIVE,
+					   GL_NOCACHE, &ghs[qx]);
+		if (error)
+			goto out;
+	}
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (error)
+		goto out;
+
+	for (x = 0; x < num_qd; x++) {
+		int alloc_required;
+
+		offset = qd2offset(qda[x]);
+		error = gfs2_write_alloc_required(ip, offset,
+						  sizeof(struct gfs2_quota),
+						  &alloc_required);
+		if (error)
+			goto out_gunlock;
+		if (alloc_required)
+			nalloc++;
+	}
+
+	if (nalloc) {
+		al = gfs2_alloc_get(ip);
+
+		al->al_requested = nalloc * (data_blocks + ind_blocks);
+
+		error = gfs2_inplace_reserve(ip);
+		if (error)
+			goto out_alloc;
+
+		error = gfs2_trans_begin(sdp,
+					 al->al_rgd->rd_ri.ri_length +
+					 num_qd * data_blocks +
+					 nalloc * ind_blocks +
+					 RES_DINODE + num_qd +
+					 RES_STATFS, 0);
+		if (error)
+			goto out_ipres;
+	} else {
+		error = gfs2_trans_begin(sdp,
+					 num_qd * data_blocks +
+					 RES_DINODE + num_qd, 0);
+		if (error)
+			goto out_gunlock;
+	}
+
+	for (x = 0; x < num_qd; x++) {
+		qd = qda[x];
+		offset = qd2offset(qd);
+		error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync,
+					  (struct gfs2_quota_data *)
+					  qd->qd_gl->gl_lvb);
+		if (error)
+			goto out_end_trans;
+
+		do_qc(qd, -qd->qd_change_sync);
+	}
+
+	error = 0;
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipres:
+	if (nalloc)
+		gfs2_inplace_release(ip);
+out_alloc:
+	if (nalloc)
+		gfs2_alloc_put(ip);
+out_gunlock:
+	gfs2_glock_dq_uninit(&i_gh);
+out:
+	while (qx--)
+		gfs2_glock_dq_uninit(&ghs[qx]);
+	kfree(ghs);
+	gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
+	return error;
+}
+
+static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
+		    struct gfs2_holder *q_gh)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+	struct gfs2_holder i_gh;
+	struct gfs2_quota q;
+	char buf[sizeof(struct gfs2_quota)];
+	struct file_ra_state ra_state;
+	int error;
+	struct gfs2_quota_lvb *qlvb;
+
+	file_ra_state_init(&ra_state, sdp->sd_quota_inode->i_mapping);
+restart:
+	error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
+	if (error)
+		return error;
+
+	qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+
+	if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {
+		loff_t pos;
+		gfs2_glock_dq_uninit(q_gh);
+		error = gfs2_glock_nq_init(qd->qd_gl,
+					  LM_ST_EXCLUSIVE, GL_NOCACHE,
+					  q_gh);
+		if (error)
+			return error;
+
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+		if (error)
+			goto fail;
+
+		memset(buf, 0, sizeof(struct gfs2_quota));
+		pos = qd2offset(qd);
+		error = gfs2_internal_read(ip, &ra_state, buf,
+					   &pos, sizeof(struct gfs2_quota));
+		if (error < 0)
+			goto fail_gunlock;
+
+		gfs2_glock_dq_uninit(&i_gh);
+
+
+		gfs2_quota_in(&q, buf);
+		qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
+		qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
+		qlvb->__pad = 0;
+		qlvb->qb_limit = cpu_to_be64(q.qu_limit);
+		qlvb->qb_warn = cpu_to_be64(q.qu_warn);
+		qlvb->qb_value = cpu_to_be64(q.qu_value);
+		qd->qd_qb = *qlvb;
+
+		if (gfs2_glock_is_blocking(qd->qd_gl)) {
+			gfs2_glock_dq_uninit(q_gh);
+			force_refresh = 0;
+			goto restart;
+		}
+	}
+
+	return 0;
+
+fail_gunlock:
+	gfs2_glock_dq_uninit(&i_gh);
+fail:
+	gfs2_glock_dq_uninit(q_gh);
+	return error;
+}
+
+int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = &ip->i_alloc;
+	unsigned int x;
+	int error = 0;
+
+	gfs2_quota_hold(ip, uid, gid);
+
+	if (capable(CAP_SYS_RESOURCE) ||
+	    sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+		return 0;
+
+	sort(al->al_qd, al->al_qd_num, sizeof(struct gfs2_quota_data *),
+	     sort_qd, NULL);
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]);
+		if (error)
+			break;
+	}
+
+	if (!error)
+		set_bit(GIF_QD_LOCKED, &ip->i_flags);
+	else {
+		while (x--)
+			gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+		gfs2_quota_unhold(ip);
+	}
+
+	return error;
+}
+
+static int need_sync(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	s64 value;
+	unsigned int num, den;
+	int do_sync = 1;
+
+	if (!qd->qd_qb.qb_limit)
+		return 0;
+
+	spin_lock(&sdp->sd_quota_spin);
+	value = qd->qd_change;
+	spin_unlock(&sdp->sd_quota_spin);
+
+	spin_lock(&gt->gt_spin);
+	num = gt->gt_quota_scale_num;
+	den = gt->gt_quota_scale_den;
+	spin_unlock(&gt->gt_spin);
+
+	if (value < 0)
+		do_sync = 0;
+	else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
+		 (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+		do_sync = 0;
+	else {
+		value *= gfs2_jindex_size(sdp) * num;
+		do_div(value, den);
+		value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
+		if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+			do_sync = 0;
+	}
+
+	return do_sync;
+}
+
+void gfs2_quota_unlock(struct gfs2_inode *ip)
+{
+	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_quota_data *qda[4];
+	unsigned int count = 0;
+	unsigned int x;
+
+	if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
+		goto out;
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		struct gfs2_quota_data *qd;
+		int sync;
+
+		qd = al->al_qd[x];
+		sync = need_sync(qd);
+
+		gfs2_glock_dq_uninit(&al->al_qd_ghs[x]);
+
+		if (sync && qd_trylock(qd))
+			qda[count++] = qd;
+	}
+
+	if (count) {
+		do_sync(count, qda);
+		for (x = 0; x < count; x++)
+			qd_unlock(qda[x]);
+	}
+
+out:
+	gfs2_quota_unhold(ip);
+}
+
+#define MAX_LINE 256
+
+static int print_message(struct gfs2_quota_data *qd, char *type)
+{
+	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+
+	printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\r\n",
+	       sdp->sd_fsname, type,
+	       (test_bit(QDF_USER, &qd->qd_flags)) ? "user" : "group",
+	       qd->qd_id);
+
+	return 0;
+}
+
+int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_quota_data *qd;
+	s64 value;
+	unsigned int x;
+	int error = 0;
+
+	if (!test_bit(GIF_QD_LOCKED, &ip->i_flags))
+		return 0;
+
+        if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+                return 0;
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		qd = al->al_qd[x];
+
+		if (!((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
+		      (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))))
+			continue;
+
+		value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
+		spin_lock(&sdp->sd_quota_spin);
+		value += qd->qd_change;
+		spin_unlock(&sdp->sd_quota_spin);
+
+		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
+			print_message(qd, "exceeded");
+			error = -EDQUOT;
+			break;
+		} else if (be64_to_cpu(qd->qd_qb.qb_warn) &&
+			   (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value &&
+			   time_after_eq(jiffies, qd->qd_last_warn +
+					 gfs2_tune_get(sdp,
+						gt_quota_warn_period) * HZ)) {
+			error = print_message(qd, "warning");
+			qd->qd_last_warn = jiffies;
+		}
+	}
+
+	return error;
+}
+
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+		       u32 uid, u32 gid)
+{
+	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_quota_data *qd;
+	unsigned int x;
+	unsigned int found = 0;
+
+	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
+		return;
+	if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
+		return;
+
+	for (x = 0; x < al->al_qd_num; x++) {
+		qd = al->al_qd[x];
+
+		if ((qd->qd_id == uid && test_bit(QDF_USER, &qd->qd_flags)) ||
+		    (qd->qd_id == gid && !test_bit(QDF_USER, &qd->qd_flags))) {
+			do_qc(qd, change);
+			found++;
+		}
+	}
+}
+
+int gfs2_quota_sync(struct gfs2_sbd *sdp)
+{
+	struct gfs2_quota_data **qda;
+	unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
+	unsigned int num_qd;
+	unsigned int x;
+	int error = 0;
+
+	sdp->sd_quota_sync_gen++;
+
+	qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
+	if (!qda)
+		return -ENOMEM;
+
+	do {
+		num_qd = 0;
+
+		for (;;) {
+			error = qd_fish(sdp, qda + num_qd);
+			if (error || !qda[num_qd])
+				break;
+			if (++num_qd == max_qd)
+				break;
+		}
+
+		if (num_qd) {
+			if (!error)
+				error = do_sync(num_qd, qda);
+			if (!error)
+				for (x = 0; x < num_qd; x++)
+					qda[x]->qd_sync_gen =
+						sdp->sd_quota_sync_gen;
+
+			for (x = 0; x < num_qd; x++)
+				qd_unlock(qda[x]);
+		}
+	} while (!error && num_qd == max_qd);
+
+	kfree(qda);
+
+	return error;
+}
+
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id)
+{
+	struct gfs2_quota_data *qd;
+	struct gfs2_holder q_gh;
+	int error;
+
+	error = qd_get(sdp, user, id, CREATE, &qd);
+	if (error)
+		return error;
+
+	error = do_glock(qd, FORCE, &q_gh);
+	if (!error)
+		gfs2_glock_dq_uninit(&q_gh);
+
+	qd_put(qd);
+
+	return error;
+}
+
+int gfs2_quota_init(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
+	unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+	unsigned int x, slot = 0;
+	unsigned int found = 0;
+	u64 dblock;
+	u32 extlen = 0;
+	int error;
+
+	if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
+	    ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+	sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
+	sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
+
+	error = -ENOMEM;
+
+	sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
+				       sizeof(unsigned char *), GFP_KERNEL);
+	if (!sdp->sd_quota_bitmap)
+		return error;
+
+	for (x = 0; x < sdp->sd_quota_chunks; x++) {
+		sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!sdp->sd_quota_bitmap[x])
+			goto fail;
+	}
+
+	for (x = 0; x < blocks; x++) {
+		struct buffer_head *bh;
+		unsigned int y;
+
+		if (!extlen) {
+			int new = 0;
+			error = gfs2_extent_map(&ip->i_inode, x, &new, &dblock, &extlen);
+			if (error)
+				goto fail;
+		}
+		error = -EIO;
+		bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
+		if (!bh)
+			goto fail;
+		if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) {
+			brelse(bh);
+			goto fail;
+		}
+
+		for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
+		     y++, slot++) {
+			struct gfs2_quota_change qc;
+			struct gfs2_quota_data *qd;
+
+			gfs2_quota_change_in(&qc, bh->b_data +
+					  sizeof(struct gfs2_meta_header) +
+					  y * sizeof(struct gfs2_quota_change));
+			if (!qc.qc_change)
+				continue;
+
+			error = qd_alloc(sdp, (qc.qc_flags & GFS2_QCF_USER),
+					 qc.qc_id, &qd);
+			if (error) {
+				brelse(bh);
+				goto fail;
+			}
+
+			set_bit(QDF_CHANGE, &qd->qd_flags);
+			qd->qd_change = qc.qc_change;
+			qd->qd_slot = slot;
+			qd->qd_slot_count = 1;
+			qd->qd_last_touched = jiffies;
+
+			spin_lock(&sdp->sd_quota_spin);
+			gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
+			list_add(&qd->qd_list, &sdp->sd_quota_list);
+			atomic_inc(&sdp->sd_quota_count);
+			spin_unlock(&sdp->sd_quota_spin);
+
+			found++;
+		}
+
+		brelse(bh);
+		dblock++;
+		extlen--;
+	}
+
+	if (found)
+		fs_info(sdp, "found %u quota changes\n", found);
+
+	return 0;
+
+fail:
+	gfs2_quota_cleanup(sdp);
+	return error;
+}
+
+void gfs2_quota_scan(struct gfs2_sbd *sdp)
+{
+	struct gfs2_quota_data *qd, *safe;
+	LIST_HEAD(dead);
+
+	spin_lock(&sdp->sd_quota_spin);
+	list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
+		if (!qd->qd_count &&
+		    time_after_eq(jiffies, qd->qd_last_touched +
+			        gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
+			list_move(&qd->qd_list, &dead);
+			gfs2_assert_warn(sdp,
+					 atomic_read(&sdp->sd_quota_count) > 0);
+			atomic_dec(&sdp->sd_quota_count);
+		}
+	}
+	spin_unlock(&sdp->sd_quota_spin);
+
+	while (!list_empty(&dead)) {
+		qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
+		list_del(&qd->qd_list);
+
+		gfs2_assert_warn(sdp, !qd->qd_change);
+		gfs2_assert_warn(sdp, !qd->qd_slot_count);
+		gfs2_assert_warn(sdp, !qd->qd_bh_count);
+
+		gfs2_lvb_unhold(qd->qd_gl);
+		kfree(qd);
+	}
+}
+
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
+{
+	struct list_head *head = &sdp->sd_quota_list;
+	struct gfs2_quota_data *qd;
+	unsigned int x;
+
+	spin_lock(&sdp->sd_quota_spin);
+	while (!list_empty(head)) {
+		qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
+
+		if (qd->qd_count > 1 ||
+		    (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
+			list_move(&qd->qd_list, head);
+			spin_unlock(&sdp->sd_quota_spin);
+			schedule();
+			spin_lock(&sdp->sd_quota_spin);
+			continue;
+		}
+
+		list_del(&qd->qd_list);
+		atomic_dec(&sdp->sd_quota_count);
+		spin_unlock(&sdp->sd_quota_spin);
+
+		if (!qd->qd_count) {
+			gfs2_assert_warn(sdp, !qd->qd_change);
+			gfs2_assert_warn(sdp, !qd->qd_slot_count);
+		} else
+			gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
+		gfs2_assert_warn(sdp, !qd->qd_bh_count);
+
+		gfs2_lvb_unhold(qd->qd_gl);
+		kfree(qd);
+
+		spin_lock(&sdp->sd_quota_spin);
+	}
+	spin_unlock(&sdp->sd_quota_spin);
+
+	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
+
+	if (sdp->sd_quota_bitmap) {
+		for (x = 0; x < sdp->sd_quota_chunks; x++)
+			kfree(sdp->sd_quota_bitmap[x]);
+		kfree(sdp->sd_quota_bitmap);
+	}
+}
+
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
new file mode 100644
index 000000000000..a8be1417051f
--- /dev/null
+++ b/fs/gfs2/quota.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __QUOTA_DOT_H__
+#define __QUOTA_DOT_H__
+
+struct gfs2_inode;
+struct gfs2_sbd;
+
+#define NO_QUOTA_CHANGE ((u32)-1)
+
+int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+void gfs2_quota_unhold(struct gfs2_inode *ip);
+
+int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+void gfs2_quota_unlock(struct gfs2_inode *ip);
+
+int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+		       u32 uid, u32 gid);
+
+int gfs2_quota_sync(struct gfs2_sbd *sdp);
+int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+
+int gfs2_quota_init(struct gfs2_sbd *sdp);
+void gfs2_quota_scan(struct gfs2_sbd *sdp);
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+
+#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
new file mode 100644
index 000000000000..62cd223819b7
--- /dev/null
+++ b/fs/gfs2/recovery.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "lm.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "recovery.h"
+#include "super.h"
+#include "util.h"
+#include "dir.h"
+
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+			   struct buffer_head **bh)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_glock *gl = ip->i_gl;
+	int new = 0;
+	u64 dblock;
+	u32 extlen;
+	int error;
+
+	error = gfs2_extent_map(&ip->i_inode, blk, &new, &dblock, &extlen);
+	if (error)
+		return error;
+	if (!dblock) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	*bh = gfs2_meta_ra(gl, dblock, extlen);
+
+	return error;
+}
+
+int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
+{
+	struct list_head *head = &sdp->sd_revoke_list;
+	struct gfs2_revoke_replay *rr;
+	int found = 0;
+
+	list_for_each_entry(rr, head, rr_list) {
+		if (rr->rr_blkno == blkno) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (found) {
+		rr->rr_where = where;
+		return 0;
+	}
+
+	rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL);
+	if (!rr)
+		return -ENOMEM;
+
+	rr->rr_blkno = blkno;
+	rr->rr_where = where;
+	list_add(&rr->rr_list, head);
+
+	return 1;
+}
+
+int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
+{
+	struct gfs2_revoke_replay *rr;
+	int wrap, a, b, revoke;
+	int found = 0;
+
+	list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
+		if (rr->rr_blkno == blkno) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		return 0;
+
+	wrap = (rr->rr_where < sdp->sd_replay_tail);
+	a = (sdp->sd_replay_tail < where);
+	b = (where < rr->rr_where);
+	revoke = (wrap) ? (a || b) : (a && b);
+
+	return revoke;
+}
+
+void gfs2_revoke_clean(struct gfs2_sbd *sdp)
+{
+	struct list_head *head = &sdp->sd_revoke_list;
+	struct gfs2_revoke_replay *rr;
+
+	while (!list_empty(head)) {
+		rr = list_entry(head->next, struct gfs2_revoke_replay, rr_list);
+		list_del(&rr->rr_list);
+		kfree(rr);
+	}
+}
+
+/**
+ * get_log_header - read the log header for a given segment
+ * @jd: the journal
+ * @blk: the block to look at
+ * @lh: the log header to return
+ *
+ * Read the log header for a given segement in a given journal.  Do a few
+ * sanity checks on it.
+ *
+ * Returns: 0 on success,
+ *          1 if the header was invalid or incomplete,
+ *          errno on error
+ */
+
+static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
+			  struct gfs2_log_header *head)
+{
+	struct buffer_head *bh;
+	struct gfs2_log_header lh;
+	u32 hash;
+	int error;
+
+	error = gfs2_replay_read_block(jd, blk, &bh);
+	if (error)
+		return error;
+
+	memcpy(&lh, bh->b_data, sizeof(struct gfs2_log_header));
+	lh.lh_hash = 0;
+	hash = gfs2_disk_hash((char *)&lh, sizeof(struct gfs2_log_header));
+	gfs2_log_header_in(&lh, bh->b_data);
+
+	brelse(bh);
+
+	if (lh.lh_header.mh_magic != GFS2_MAGIC ||
+	    lh.lh_header.mh_type != GFS2_METATYPE_LH ||
+	    lh.lh_blkno != blk || lh.lh_hash != hash)
+		return 1;
+
+	*head = lh;
+
+	return 0;
+}
+
+/**
+ * find_good_lh - find a good log header
+ * @jd: the journal
+ * @blk: the segment to start searching from
+ * @lh: the log header to fill in
+ * @forward: if true search forward in the log, else search backward
+ *
+ * Call get_log_header() to get a log header for a segment, but if the
+ * segment is bad, either scan forward or backward until we find a good one.
+ *
+ * Returns: errno
+ */
+
+static int find_good_lh(struct gfs2_jdesc *jd, unsigned int *blk,
+			struct gfs2_log_header *head)
+{
+	unsigned int orig_blk = *blk;
+	int error;
+
+	for (;;) {
+		error = get_log_header(jd, *blk, head);
+		if (error <= 0)
+			return error;
+
+		if (++*blk == jd->jd_blocks)
+			*blk = 0;
+
+		if (*blk == orig_blk) {
+			gfs2_consist_inode(GFS2_I(jd->jd_inode));
+			return -EIO;
+		}
+	}
+}
+
+/**
+ * jhead_scan - make sure we've found the head of the log
+ * @jd: the journal
+ * @head: this is filled in with the log descriptor of the head
+ *
+ * At this point, seg and lh should be either the head of the log or just
+ * before.  Scan forward until we find the head.
+ *
+ * Returns: errno
+ */
+
+static int jhead_scan(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+{
+	unsigned int blk = head->lh_blkno;
+	struct gfs2_log_header lh;
+	int error;
+
+	for (;;) {
+		if (++blk == jd->jd_blocks)
+			blk = 0;
+
+		error = get_log_header(jd, blk, &lh);
+		if (error < 0)
+			return error;
+		if (error == 1)
+			continue;
+
+		if (lh.lh_sequence == head->lh_sequence) {
+			gfs2_consist_inode(GFS2_I(jd->jd_inode));
+			return -EIO;
+		}
+		if (lh.lh_sequence < head->lh_sequence)
+			break;
+
+		*head = lh;
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_find_jhead - find the head of a log
+ * @jd: the journal
+ * @head: the log descriptor for the head of the log is returned here
+ *
+ * Do a binary search of a journal and find the valid log entry with the
+ * highest sequence number.  (i.e. the log head)
+ *
+ * Returns: errno
+ */
+
+int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+{
+	struct gfs2_log_header lh_1, lh_m;
+	u32 blk_1, blk_2, blk_m;
+	int error;
+
+	blk_1 = 0;
+	blk_2 = jd->jd_blocks - 1;
+
+	for (;;) {
+		blk_m = (blk_1 + blk_2) / 2;
+
+		error = find_good_lh(jd, &blk_1, &lh_1);
+		if (error)
+			return error;
+
+		error = find_good_lh(jd, &blk_m, &lh_m);
+		if (error)
+			return error;
+
+		if (blk_1 == blk_m || blk_m == blk_2)
+			break;
+
+		if (lh_1.lh_sequence <= lh_m.lh_sequence)
+			blk_1 = blk_m;
+		else
+			blk_2 = blk_m;
+	}
+
+	error = jhead_scan(jd, &lh_1);
+	if (error)
+		return error;
+
+	*head = lh_1;
+
+	return error;
+}
+
+/**
+ * foreach_descriptor - go through the active part of the log
+ * @jd: the journal
+ * @start: the first log header in the active region
+ * @end: the last log header (don't process the contents of this entry))
+ *
+ * Call a given function once for every log descriptor in the active
+ * portion of the log.
+ *
+ * Returns: errno
+ */
+
+static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start,
+			      unsigned int end, int pass)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct buffer_head *bh;
+	struct gfs2_log_descriptor *ld;
+	int error = 0;
+	u32 length;
+	__be64 *ptr;
+	unsigned int offset = sizeof(struct gfs2_log_descriptor);
+	offset += sizeof(__be64) - 1;
+	offset &= ~(sizeof(__be64) - 1);
+
+	while (start != end) {
+		error = gfs2_replay_read_block(jd, start, &bh);
+		if (error)
+			return error;
+		if (gfs2_meta_check(sdp, bh)) {
+			brelse(bh);
+			return -EIO;
+		}
+		ld = (struct gfs2_log_descriptor *)bh->b_data;
+		length = be32_to_cpu(ld->ld_length);
+
+		if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
+			struct gfs2_log_header lh;
+			error = get_log_header(jd, start, &lh);
+			if (!error) {
+				gfs2_replay_incr_blk(sdp, &start);
+				brelse(bh);
+				continue;
+			}
+			if (error == 1) {
+				gfs2_consist_inode(GFS2_I(jd->jd_inode));
+				error = -EIO;
+			}
+			brelse(bh);
+			return error;
+		} else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
+			brelse(bh);
+			return -EIO;
+		}
+		ptr = (__be64 *)(bh->b_data + offset);
+		error = lops_scan_elements(jd, start, ld, ptr, pass);
+		if (error) {
+			brelse(bh);
+			return error;
+		}
+
+		while (length--)
+			gfs2_replay_incr_blk(sdp, &start);
+
+		brelse(bh);
+	}
+
+	return 0;
+}
+
+/**
+ * clean_journal - mark a dirty journal as being clean
+ * @sdp: the filesystem
+ * @jd: the journal
+ * @gl: the journal's glock
+ * @head: the head journal to start from
+ *
+ * Returns: errno
+ */
+
+static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header *head)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	unsigned int lblock;
+	struct gfs2_log_header *lh;
+	u32 hash;
+	struct buffer_head *bh;
+	int error;
+	struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
+
+	lblock = head->lh_blkno;
+	gfs2_replay_incr_blk(sdp, &lblock);
+	bh_map.b_size = 1 << ip->i_inode.i_blkbits;
+	error = gfs2_block_map(&ip->i_inode, lblock, 0, &bh_map);
+	if (error)
+		return error;
+	if (!bh_map.b_blocknr) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	bh = sb_getblk(sdp->sd_vfs, bh_map.b_blocknr);
+	lock_buffer(bh);
+	memset(bh->b_data, 0, bh->b_size);
+	set_buffer_uptodate(bh);
+	clear_buffer_dirty(bh);
+	unlock_buffer(bh);
+
+	lh = (struct gfs2_log_header *)bh->b_data;
+	memset(lh, 0, sizeof(struct gfs2_log_header));
+	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
+	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
+	lh->lh_sequence = cpu_to_be64(head->lh_sequence + 1);
+	lh->lh_flags = cpu_to_be32(GFS2_LOG_HEAD_UNMOUNT);
+	lh->lh_blkno = cpu_to_be32(lblock);
+	hash = gfs2_disk_hash((const char *)lh, sizeof(struct gfs2_log_header));
+	lh->lh_hash = cpu_to_be32(hash);
+
+	set_buffer_dirty(bh);
+	if (sync_dirty_buffer(bh))
+		gfs2_io_error_bh(sdp, bh);
+	brelse(bh);
+
+	return error;
+}
+
+/**
+ * gfs2_recover_journal - recovery a given journal
+ * @jd: the struct gfs2_jdesc describing the journal
+ *
+ * Acquire the journal's lock, check to see if the journal is clean, and
+ * do recovery if necessary.
+ *
+ * Returns: errno
+ */
+
+int gfs2_recover_journal(struct gfs2_jdesc *jd)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct gfs2_log_header head;
+	struct gfs2_holder j_gh, ji_gh, t_gh;
+	unsigned long t;
+	int ro = 0;
+	unsigned int pass;
+	int error;
+
+	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+		fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
+			jd->jd_jid);
+
+		/* Aquire the journal lock so we can do recovery */
+
+		error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
+					  LM_ST_EXCLUSIVE,
+					  LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE,
+					  &j_gh);
+		switch (error) {
+		case 0:
+			break;
+
+		case GLR_TRYFAILED:
+			fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid);
+			error = 0;
+
+		default:
+			goto fail;
+		};
+
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
+					   LM_FLAG_NOEXP, &ji_gh);
+		if (error)
+			goto fail_gunlock_j;
+	} else {
+		fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid);
+	}
+
+	fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid);
+
+	error = gfs2_jdesc_check(jd);
+	if (error)
+		goto fail_gunlock_ji;
+
+	error = gfs2_find_jhead(jd, &head);
+	if (error)
+		goto fail_gunlock_ji;
+
+	if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+		fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n",
+			jd->jd_jid);
+
+		t = jiffies;
+
+		/* Acquire a shared hold on the transaction lock */
+
+		error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+					   LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
+					   GL_NOCANCEL | GL_NOCACHE, &t_gh);
+		if (error)
+			goto fail_gunlock_ji;
+
+		if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
+			if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+				ro = 1;
+		} else {
+			if (sdp->sd_vfs->s_flags & MS_RDONLY)
+				ro = 1;
+		}
+
+		if (ro) {
+			fs_warn(sdp, "jid=%u: Can't replay: read-only FS\n",
+				jd->jd_jid);
+			error = -EROFS;
+			goto fail_gunlock_tr;
+		}
+
+		fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
+
+		for (pass = 0; pass < 2; pass++) {
+			lops_before_scan(jd, &head, pass);
+			error = foreach_descriptor(jd, head.lh_tail,
+						   head.lh_blkno, pass);
+			lops_after_scan(jd, error, pass);
+			if (error)
+				goto fail_gunlock_tr;
+		}
+
+		error = clean_journal(jd, &head);
+		if (error)
+			goto fail_gunlock_tr;
+
+		gfs2_glock_dq_uninit(&t_gh);
+		t = DIV_ROUND_UP(jiffies - t, HZ);
+		fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
+			jd->jd_jid, t);
+	}
+
+	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+		gfs2_glock_dq_uninit(&ji_gh);
+
+	gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
+
+	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
+		gfs2_glock_dq_uninit(&j_gh);
+
+	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
+	return 0;
+
+fail_gunlock_tr:
+	gfs2_glock_dq_uninit(&t_gh);
+fail_gunlock_ji:
+	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) {
+		gfs2_glock_dq_uninit(&ji_gh);
+fail_gunlock_j:
+		gfs2_glock_dq_uninit(&j_gh);
+	}
+
+	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
+
+fail:
+	gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+	return error;
+}
+
+/**
+ * gfs2_check_journals - Recover any dirty journals
+ * @sdp: the filesystem
+ *
+ */
+
+void gfs2_check_journals(struct gfs2_sbd *sdp)
+{
+	struct gfs2_jdesc *jd;
+
+	for (;;) {
+		jd = gfs2_jdesc_find_dirty(sdp);
+		if (!jd)
+			break;
+
+		if (jd != sdp->sd_jdesc)
+			gfs2_recover_journal(jd);
+	}
+}
+
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
new file mode 100644
index 000000000000..961feedf4d8b
--- /dev/null
+++ b/fs/gfs2/recovery.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __RECOVERY_DOT_H__
+#define __RECOVERY_DOT_H__
+
+#include "incore.h"
+
+static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
+{
+	if (++*blk == sdp->sd_jdesc->jd_blocks)
+	        *blk = 0;
+}
+
+int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+			   struct buffer_head **bh);
+
+int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+void gfs2_revoke_clean(struct gfs2_sbd *sdp);
+
+int gfs2_find_jhead(struct gfs2_jdesc *jd,
+		    struct gfs2_log_header *head);
+int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
+void gfs2_check_journals(struct gfs2_sbd *sdp);
+
+#endif /* __RECOVERY_DOT_H__ */
+
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
new file mode 100644
index 000000000000..b261385c0065
--- /dev/null
+++ b/fs/gfs2/rgrp.c
@@ -0,0 +1,1513 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "glops.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "ops_file.h"
+#include "util.h"
+
+#define BFITNOENT ((u32)~0)
+
+/*
+ * These routines are used by the resource group routines (rgrp.c)
+ * to keep track of block allocation.  Each block is represented by two
+ * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
+ *
+ * 0 = Free
+ * 1 = Used (not metadata)
+ * 2 = Unlinked (still in use) inode
+ * 3 = Used (metadata)
+ */
+
+static const char valid_change[16] = {
+	        /* current */
+	/* n */ 0, 1, 1, 1,
+	/* e */ 1, 0, 0, 0,
+	/* w */ 0, 0, 0, 1,
+	        1, 0, 0, 0
+};
+
+/**
+ * gfs2_setbit - Set a bit in the bitmaps
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @block: the block to set
+ * @new_state: the new state of the block
+ *
+ */
+
+static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+			unsigned int buflen, u32 block,
+			unsigned char new_state)
+{
+	unsigned char *byte, *end, cur_state;
+	unsigned int bit;
+
+	byte = buffer + (block / GFS2_NBBY);
+	bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+	end = buffer + buflen;
+
+	gfs2_assert(rgd->rd_sbd, byte < end);
+
+	cur_state = (*byte >> bit) & GFS2_BIT_MASK;
+
+	if (valid_change[new_state * 4 + cur_state]) {
+		*byte ^= cur_state << bit;
+		*byte |= new_state << bit;
+	} else
+		gfs2_consist_rgrpd(rgd);
+}
+
+/**
+ * gfs2_testbit - test a bit in the bitmaps
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @block: the block to read
+ *
+ */
+
+static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+				  unsigned int buflen, u32 block)
+{
+	unsigned char *byte, *end, cur_state;
+	unsigned int bit;
+
+	byte = buffer + (block / GFS2_NBBY);
+	bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
+	end = buffer + buflen;
+
+	gfs2_assert(rgd->rd_sbd, byte < end);
+
+	cur_state = (*byte >> bit) & GFS2_BIT_MASK;
+
+	return cur_state;
+}
+
+/**
+ * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
+ *       a block in a given allocation state.
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @goal: start search at this block's bit-pair (within @buffer)
+ * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
+ *       bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
+ *
+ * Scope of @goal and returned block number is only within this bitmap buffer,
+ * not entire rgrp or filesystem.  @buffer will be offset from the actual
+ * beginning of a bitmap block buffer, skipping any header structures.
+ *
+ * Return: the block number (bitmap buffer scope) that was found
+ */
+
+static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+			    unsigned int buflen, u32 goal,
+			    unsigned char old_state)
+{
+	unsigned char *byte, *end, alloc;
+	u32 blk = goal;
+	unsigned int bit;
+
+	byte = buffer + (goal / GFS2_NBBY);
+	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
+	end = buffer + buflen;
+	alloc = (old_state & 1) ? 0 : 0x55;
+
+	while (byte < end) {
+		if ((*byte & 0x55) == alloc) {
+			blk += (8 - bit) >> 1;
+
+			bit = 0;
+			byte++;
+
+			continue;
+		}
+
+		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
+			return blk;
+
+		bit += GFS2_BIT_SIZE;
+		if (bit >= 8) {
+			bit = 0;
+			byte++;
+		}
+
+		blk++;
+	}
+
+	return BFITNOENT;
+}
+
+/**
+ * gfs2_bitcount - count the number of bits in a certain state
+ * @buffer: the buffer that holds the bitmaps
+ * @buflen: the length (in bytes) of the buffer
+ * @state: the state of the block we're looking for
+ *
+ * Returns: The number of bits
+ */
+
+static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer,
+			      unsigned int buflen, unsigned char state)
+{
+	unsigned char *byte = buffer;
+	unsigned char *end = buffer + buflen;
+	unsigned char state1 = state << 2;
+	unsigned char state2 = state << 4;
+	unsigned char state3 = state << 6;
+	u32 count = 0;
+
+	for (; byte < end; byte++) {
+		if (((*byte) & 0x03) == state)
+			count++;
+		if (((*byte) & 0x0C) == state1)
+			count++;
+		if (((*byte) & 0x30) == state2)
+			count++;
+		if (((*byte) & 0xC0) == state3)
+			count++;
+	}
+
+	return count;
+}
+
+/**
+ * gfs2_rgrp_verify - Verify that a resource group is consistent
+ * @sdp: the filesystem
+ * @rgd: the rgrp
+ *
+ */
+
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_bitmap *bi = NULL;
+	u32 length = rgd->rd_ri.ri_length;
+	u32 count[4], tmp;
+	int buf, x;
+
+	memset(count, 0, 4 * sizeof(u32));
+
+	/* Count # blocks in each of 4 possible allocation states */
+	for (buf = 0; buf < length; buf++) {
+		bi = rgd->rd_bits + buf;
+		for (x = 0; x < 4; x++)
+			count[x] += gfs2_bitcount(rgd,
+						  bi->bi_bh->b_data +
+						  bi->bi_offset,
+						  bi->bi_len, x);
+	}
+
+	if (count[0] != rgd->rd_rg.rg_free) {
+		if (gfs2_consist_rgrpd(rgd))
+			fs_err(sdp, "free data mismatch:  %u != %u\n",
+			       count[0], rgd->rd_rg.rg_free);
+		return;
+	}
+
+	tmp = rgd->rd_ri.ri_data -
+		rgd->rd_rg.rg_free -
+		rgd->rd_rg.rg_dinodes;
+	if (count[1] + count[2] != tmp) {
+		if (gfs2_consist_rgrpd(rgd))
+			fs_err(sdp, "used data mismatch:  %u != %u\n",
+			       count[1], tmp);
+		return;
+	}
+
+	if (count[3] != rgd->rd_rg.rg_dinodes) {
+		if (gfs2_consist_rgrpd(rgd))
+			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
+			       count[3], rgd->rd_rg.rg_dinodes);
+		return;
+	}
+
+	if (count[2] > count[3]) {
+		if (gfs2_consist_rgrpd(rgd))
+			fs_err(sdp, "unlinked inodes > inodes:  %u\n",
+			       count[2]);
+		return;
+	}
+
+}
+
+static inline int rgrp_contains_block(struct gfs2_rindex *ri, u64 block)
+{
+	u64 first = ri->ri_data0;
+	u64 last = first + ri->ri_data;
+	return first <= block && block < last;
+}
+
+/**
+ * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
+ * @sdp: The GFS2 superblock
+ * @n: The data block number
+ *
+ * Returns: The resource group, or NULL if not found
+ */
+
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk)
+{
+	struct gfs2_rgrpd *rgd;
+
+	spin_lock(&sdp->sd_rindex_spin);
+
+	list_for_each_entry(rgd, &sdp->sd_rindex_mru_list, rd_list_mru) {
+		if (rgrp_contains_block(&rgd->rd_ri, blk)) {
+			list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+			spin_unlock(&sdp->sd_rindex_spin);
+			return rgd;
+		}
+	}
+
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	return NULL;
+}
+
+/**
+ * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: The first rgrp in the filesystem
+ */
+
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
+{
+	gfs2_assert(sdp, !list_empty(&sdp->sd_rindex_list));
+	return list_entry(sdp->sd_rindex_list.next, struct gfs2_rgrpd, rd_list);
+}
+
+/**
+ * gfs2_rgrpd_get_next - get the next RG
+ * @rgd: A RG
+ *
+ * Returns: The next rgrp
+ */
+
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
+{
+	if (rgd->rd_list.next == &rgd->rd_sbd->sd_rindex_list)
+		return NULL;
+	return list_entry(rgd->rd_list.next, struct gfs2_rgrpd, rd_list);
+}
+
+static void clear_rgrpdi(struct gfs2_sbd *sdp)
+{
+	struct list_head *head;
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_glock *gl;
+
+	spin_lock(&sdp->sd_rindex_spin);
+	sdp->sd_rindex_forward = NULL;
+	head = &sdp->sd_rindex_recent_list;
+	while (!list_empty(head)) {
+		rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
+		list_del(&rgd->rd_recent);
+	}
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	head = &sdp->sd_rindex_list;
+	while (!list_empty(head)) {
+		rgd = list_entry(head->next, struct gfs2_rgrpd, rd_list);
+		gl = rgd->rd_gl;
+
+		list_del(&rgd->rd_list);
+		list_del(&rgd->rd_list_mru);
+
+		if (gl) {
+			gl->gl_object = NULL;
+			gfs2_glock_put(gl);
+		}
+
+		kfree(rgd->rd_bits);
+		kfree(rgd);
+	}
+}
+
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
+{
+	mutex_lock(&sdp->sd_rindex_mutex);
+	clear_rgrpdi(sdp);
+	mutex_unlock(&sdp->sd_rindex_mutex);
+}
+
+/**
+ * gfs2_compute_bitstructs - Compute the bitmap sizes
+ * @rgd: The resource group descriptor
+ *
+ * Calculates bitmap descriptors, one for each block that contains bitmap data
+ *
+ * Returns: errno
+ */
+
+static int compute_bitstructs(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_bitmap *bi;
+	u32 length = rgd->rd_ri.ri_length; /* # blocks in hdr & bitmap */
+	u32 bytes_left, bytes;
+	int x;
+
+	if (!length)
+		return -EINVAL;
+
+	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
+	if (!rgd->rd_bits)
+		return -ENOMEM;
+
+	bytes_left = rgd->rd_ri.ri_bitbytes;
+
+	for (x = 0; x < length; x++) {
+		bi = rgd->rd_bits + x;
+
+		/* small rgrp; bitmap stored completely in header block */
+		if (length == 1) {
+			bytes = bytes_left;
+			bi->bi_offset = sizeof(struct gfs2_rgrp);
+			bi->bi_start = 0;
+			bi->bi_len = bytes;
+		/* header block */
+		} else if (x == 0) {
+			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
+			bi->bi_offset = sizeof(struct gfs2_rgrp);
+			bi->bi_start = 0;
+			bi->bi_len = bytes;
+		/* last block */
+		} else if (x + 1 == length) {
+			bytes = bytes_left;
+			bi->bi_offset = sizeof(struct gfs2_meta_header);
+			bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
+			bi->bi_len = bytes;
+		/* other blocks */
+		} else {
+			bytes = sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_meta_header);
+			bi->bi_offset = sizeof(struct gfs2_meta_header);
+			bi->bi_start = rgd->rd_ri.ri_bitbytes - bytes_left;
+			bi->bi_len = bytes;
+		}
+
+		bytes_left -= bytes;
+	}
+
+	if (bytes_left) {
+		gfs2_consist_rgrpd(rgd);
+		return -EIO;
+	}
+	bi = rgd->rd_bits + (length - 1);
+	if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_ri.ri_data) {
+		if (gfs2_consist_rgrpd(rgd)) {
+			gfs2_rindex_print(&rgd->rd_ri);
+			fs_err(sdp, "start=%u len=%u offset=%u\n",
+			       bi->bi_start, bi->bi_len, bi->bi_offset);
+		}
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_ri_update - Pull in a new resource index from the disk
+ * @gl: The glock covering the rindex inode
+ *
+ * Returns: 0 on successful update, error code otherwise
+ */
+
+static int gfs2_ri_update(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct inode *inode = &ip->i_inode;
+	struct gfs2_rgrpd *rgd;
+	char buf[sizeof(struct gfs2_rindex)];
+	struct file_ra_state ra_state;
+	u64 junk = ip->i_di.di_size;
+	int error;
+
+	if (do_div(junk, sizeof(struct gfs2_rindex))) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	clear_rgrpdi(sdp);
+
+	file_ra_state_init(&ra_state, inode->i_mapping);
+	for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
+		loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
+		error = gfs2_internal_read(ip, &ra_state, buf, &pos,
+					    sizeof(struct gfs2_rindex));
+		if (!error)
+			break;
+		if (error != sizeof(struct gfs2_rindex)) {
+			if (error > 0)
+				error = -EIO;
+			goto fail;
+		}
+
+		rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS);
+		error = -ENOMEM;
+		if (!rgd)
+			goto fail;
+
+		mutex_init(&rgd->rd_mutex);
+		lops_init_le(&rgd->rd_le, &gfs2_rg_lops);
+		rgd->rd_sbd = sdp;
+
+		list_add_tail(&rgd->rd_list, &sdp->sd_rindex_list);
+		list_add_tail(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+
+		gfs2_rindex_in(&rgd->rd_ri, buf);
+		error = compute_bitstructs(rgd);
+		if (error)
+			goto fail;
+
+		error = gfs2_glock_get(sdp, rgd->rd_ri.ri_addr,
+				       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
+		if (error)
+			goto fail;
+
+		rgd->rd_gl->gl_object = rgd;
+		rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1;
+	}
+
+	sdp->sd_rindex_vn = ip->i_gl->gl_vn;
+	return 0;
+
+fail:
+	clear_rgrpdi(sdp);
+	return error;
+}
+
+/**
+ * gfs2_rindex_hold - Grab a lock on the rindex
+ * @sdp: The GFS2 superblock
+ * @ri_gh: the glock holder
+ *
+ * We grab a lock on the rindex inode to make sure that it doesn't
+ * change whilst we are performing an operation. We keep this lock
+ * for quite long periods of time compared to other locks. This
+ * doesn't matter, since it is shared and it is very, very rarely
+ * accessed in the exclusive mode (i.e. only when expanding the filesystem).
+ *
+ * This makes sure that we're using the latest copy of the resource index
+ * special file, which might have been updated if someone expanded the
+ * filesystem (via gfs2_grow utility), which adds new resource groups.
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+	struct gfs2_glock *gl = ip->i_gl;
+	int error;
+
+	error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, ri_gh);
+	if (error)
+		return error;
+
+	/* Read new copy from disk if we don't have the latest */
+	if (sdp->sd_rindex_vn != gl->gl_vn) {
+		mutex_lock(&sdp->sd_rindex_mutex);
+		if (sdp->sd_rindex_vn != gl->gl_vn) {
+			error = gfs2_ri_update(ip);
+			if (error)
+				gfs2_glock_dq_uninit(ri_gh);
+		}
+		mutex_unlock(&sdp->sd_rindex_mutex);
+	}
+
+	return error;
+}
+
+/**
+ * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
+ * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ *
+ * Read in all of a Resource Group's header and bitmap blocks.
+ * Caller must eventually call gfs2_rgrp_relse() to free the bitmaps.
+ *
+ * Returns: errno
+ */
+
+int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_glock *gl = rgd->rd_gl;
+	unsigned int length = rgd->rd_ri.ri_length;
+	struct gfs2_bitmap *bi;
+	unsigned int x, y;
+	int error;
+
+	mutex_lock(&rgd->rd_mutex);
+
+	spin_lock(&sdp->sd_rindex_spin);
+	if (rgd->rd_bh_count) {
+		rgd->rd_bh_count++;
+		spin_unlock(&sdp->sd_rindex_spin);
+		mutex_unlock(&rgd->rd_mutex);
+		return 0;
+	}
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	for (x = 0; x < length; x++) {
+		bi = rgd->rd_bits + x;
+		error = gfs2_meta_read(gl, rgd->rd_ri.ri_addr + x, 0, &bi->bi_bh);
+		if (error)
+			goto fail;
+	}
+
+	for (y = length; y--;) {
+		bi = rgd->rd_bits + y;
+		error = gfs2_meta_wait(sdp, bi->bi_bh);
+		if (error)
+			goto fail;
+		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
+					      GFS2_METATYPE_RG)) {
+			error = -EIO;
+			goto fail;
+		}
+	}
+
+	if (rgd->rd_rg_vn != gl->gl_vn) {
+		gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data);
+		rgd->rd_rg_vn = gl->gl_vn;
+	}
+
+	spin_lock(&sdp->sd_rindex_spin);
+	rgd->rd_free_clone = rgd->rd_rg.rg_free;
+	rgd->rd_bh_count++;
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	mutex_unlock(&rgd->rd_mutex);
+
+	return 0;
+
+fail:
+	while (x--) {
+		bi = rgd->rd_bits + x;
+		brelse(bi->bi_bh);
+		bi->bi_bh = NULL;
+		gfs2_assert_warn(sdp, !bi->bi_clone);
+	}
+	mutex_unlock(&rgd->rd_mutex);
+
+	return error;
+}
+
+void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+
+	spin_lock(&sdp->sd_rindex_spin);
+	gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
+	rgd->rd_bh_count++;
+	spin_unlock(&sdp->sd_rindex_spin);
+}
+
+/**
+ * gfs2_rgrp_bh_put - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * @rgd: the struct gfs2_rgrpd describing the RG to read in
+ *
+ */
+
+void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	int x, length = rgd->rd_ri.ri_length;
+
+	spin_lock(&sdp->sd_rindex_spin);
+	gfs2_assert_warn(rgd->rd_sbd, rgd->rd_bh_count);
+	if (--rgd->rd_bh_count) {
+		spin_unlock(&sdp->sd_rindex_spin);
+		return;
+	}
+
+	for (x = 0; x < length; x++) {
+		struct gfs2_bitmap *bi = rgd->rd_bits + x;
+		kfree(bi->bi_clone);
+		bi->bi_clone = NULL;
+		brelse(bi->bi_bh);
+		bi->bi_bh = NULL;
+	}
+
+	spin_unlock(&sdp->sd_rindex_spin);
+}
+
+void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	unsigned int length = rgd->rd_ri.ri_length;
+	unsigned int x;
+
+	for (x = 0; x < length; x++) {
+		struct gfs2_bitmap *bi = rgd->rd_bits + x;
+		if (!bi->bi_clone)
+			continue;
+		memcpy(bi->bi_clone + bi->bi_offset,
+		       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
+	}
+
+	spin_lock(&sdp->sd_rindex_spin);
+	rgd->rd_free_clone = rgd->rd_rg.rg_free;
+	spin_unlock(&sdp->sd_rindex_spin);
+}
+
+/**
+ * gfs2_alloc_get - get the struct gfs2_alloc structure for an inode
+ * @ip: the incore GFS2 inode structure
+ *
+ * Returns: the struct gfs2_alloc
+ */
+
+struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip)
+{
+	struct gfs2_alloc *al = &ip->i_alloc;
+
+	/* FIXME: Should assert that the correct locks are held here... */
+	memset(al, 0, sizeof(*al));
+	return al;
+}
+
+/**
+ * try_rgrp_fit - See if a given reservation will fit in a given RG
+ * @rgd: the RG data
+ * @al: the struct gfs2_alloc structure describing the reservation
+ *
+ * If there's room for the requested blocks to be allocated from the RG:
+ *   Sets the $al_reserved_data field in @al.
+ *   Sets the $al_reserved_meta field in @al.
+ *   Sets the $al_rgd field in @al.
+ *
+ * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
+ */
+
+static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	int ret = 0;
+
+	spin_lock(&sdp->sd_rindex_spin);
+	if (rgd->rd_free_clone >= al->al_requested) {
+		al->al_rgd = rgd;
+		ret = 1;
+	}
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	return ret;
+}
+
+/**
+ * recent_rgrp_first - get first RG from "recent" list
+ * @sdp: The GFS2 superblock
+ * @rglast: address of the rgrp used last
+ *
+ * Returns: The first rgrp in the recent list
+ */
+
+static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
+					    u64 rglast)
+{
+	struct gfs2_rgrpd *rgd = NULL;
+
+	spin_lock(&sdp->sd_rindex_spin);
+
+	if (list_empty(&sdp->sd_rindex_recent_list))
+		goto out;
+
+	if (!rglast)
+		goto first;
+
+	list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
+		if (rgd->rd_ri.ri_addr == rglast)
+			goto out;
+	}
+
+first:
+	rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd,
+			 rd_recent);
+out:
+	spin_unlock(&sdp->sd_rindex_spin);
+	return rgd;
+}
+
+/**
+ * recent_rgrp_next - get next RG from "recent" list
+ * @cur_rgd: current rgrp
+ * @remove:
+ *
+ * Returns: The next rgrp in the recent list
+ */
+
+static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
+					   int remove)
+{
+	struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
+	struct list_head *head;
+	struct gfs2_rgrpd *rgd;
+
+	spin_lock(&sdp->sd_rindex_spin);
+
+	head = &sdp->sd_rindex_recent_list;
+
+	list_for_each_entry(rgd, head, rd_recent) {
+		if (rgd == cur_rgd) {
+			if (cur_rgd->rd_recent.next != head)
+				rgd = list_entry(cur_rgd->rd_recent.next,
+						 struct gfs2_rgrpd, rd_recent);
+			else
+				rgd = NULL;
+
+			if (remove)
+				list_del(&cur_rgd->rd_recent);
+
+			goto out;
+		}
+	}
+
+	rgd = NULL;
+	if (!list_empty(head))
+		rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
+
+out:
+	spin_unlock(&sdp->sd_rindex_spin);
+	return rgd;
+}
+
+/**
+ * recent_rgrp_add - add an RG to tail of "recent" list
+ * @new_rgd: The rgrp to add
+ *
+ */
+
+static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
+{
+	struct gfs2_sbd *sdp = new_rgd->rd_sbd;
+	struct gfs2_rgrpd *rgd;
+	unsigned int count = 0;
+	unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
+
+	spin_lock(&sdp->sd_rindex_spin);
+
+	list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
+		if (rgd == new_rgd)
+			goto out;
+
+		if (++count >= max)
+			goto out;
+	}
+	list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
+
+out:
+	spin_unlock(&sdp->sd_rindex_spin);
+}
+
+/**
+ * forward_rgrp_get - get an rgrp to try next from full list
+ * @sdp: The GFS2 superblock
+ *
+ * Returns: The rgrp to try next
+ */
+
+static struct gfs2_rgrpd *forward_rgrp_get(struct gfs2_sbd *sdp)
+{
+	struct gfs2_rgrpd *rgd;
+	unsigned int journals = gfs2_jindex_size(sdp);
+	unsigned int rg = 0, x;
+
+	spin_lock(&sdp->sd_rindex_spin);
+
+	rgd = sdp->sd_rindex_forward;
+	if (!rgd) {
+		if (sdp->sd_rgrps >= journals)
+			rg = sdp->sd_rgrps * sdp->sd_jdesc->jd_jid / journals;
+
+		for (x = 0, rgd = gfs2_rgrpd_get_first(sdp); x < rg;
+		     x++, rgd = gfs2_rgrpd_get_next(rgd))
+			/* Do Nothing */;
+
+		sdp->sd_rindex_forward = rgd;
+	}
+
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	return rgd;
+}
+
+/**
+ * forward_rgrp_set - set the forward rgrp pointer
+ * @sdp: the filesystem
+ * @rgd: The new forward rgrp
+ *
+ */
+
+static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd)
+{
+	spin_lock(&sdp->sd_rindex_spin);
+	sdp->sd_rindex_forward = rgd;
+	spin_unlock(&sdp->sd_rindex_spin);
+}
+
+/**
+ * get_local_rgrp - Choose and lock a rgrp for allocation
+ * @ip: the inode to reserve space for
+ * @rgp: the chosen and locked rgrp
+ *
+ * Try to acquire rgrp in way which avoids contending with others.
+ *
+ * Returns: errno
+ */
+
+static int get_local_rgrp(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd, *begin = NULL;
+	struct gfs2_alloc *al = &ip->i_alloc;
+	int flags = LM_FLAG_TRY;
+	int skipped = 0;
+	int loops = 0;
+	int error;
+
+	/* Try recently successful rgrps */
+
+	rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc);
+
+	while (rgd) {
+		error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+					   LM_FLAG_TRY, &al->al_rgd_gh);
+		switch (error) {
+		case 0:
+			if (try_rgrp_fit(rgd, al))
+				goto out;
+			gfs2_glock_dq_uninit(&al->al_rgd_gh);
+			rgd = recent_rgrp_next(rgd, 1);
+			break;
+
+		case GLR_TRYFAILED:
+			rgd = recent_rgrp_next(rgd, 0);
+			break;
+
+		default:
+			return error;
+		}
+	}
+
+	/* Go through full list of rgrps */
+
+	begin = rgd = forward_rgrp_get(sdp);
+
+	for (;;) {
+		error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, flags,
+					  &al->al_rgd_gh);
+		switch (error) {
+		case 0:
+			if (try_rgrp_fit(rgd, al))
+				goto out;
+			gfs2_glock_dq_uninit(&al->al_rgd_gh);
+			break;
+
+		case GLR_TRYFAILED:
+			skipped++;
+			break;
+
+		default:
+			return error;
+		}
+
+		rgd = gfs2_rgrpd_get_next(rgd);
+		if (!rgd)
+			rgd = gfs2_rgrpd_get_first(sdp);
+
+		if (rgd == begin) {
+			if (++loops >= 2 || !skipped)
+				return -ENOSPC;
+			flags = 0;
+		}
+	}
+
+out:
+	ip->i_last_rg_alloc = rgd->rd_ri.ri_addr;
+
+	if (begin) {
+		recent_rgrp_add(rgd);
+		rgd = gfs2_rgrpd_get_next(rgd);
+		if (!rgd)
+			rgd = gfs2_rgrpd_get_first(sdp);
+		forward_rgrp_set(sdp, rgd);
+	}
+
+	return 0;
+}
+
+/**
+ * gfs2_inplace_reserve_i - Reserve space in the filesystem
+ * @ip: the inode to reserve space for
+ *
+ * Returns: errno
+ */
+
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = &ip->i_alloc;
+	int error;
+
+	if (gfs2_assert_warn(sdp, al->al_requested))
+		return -EINVAL;
+
+	error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+	if (error)
+		return error;
+
+	error = get_local_rgrp(ip);
+	if (error) {
+		gfs2_glock_dq_uninit(&al->al_ri_gh);
+		return error;
+	}
+
+	al->al_file = file;
+	al->al_line = line;
+
+	return 0;
+}
+
+/**
+ * gfs2_inplace_release - release an inplace reservation
+ * @ip: the inode the reservation was taken out on
+ *
+ * Release a reservation made by gfs2_inplace_reserve().
+ */
+
+void gfs2_inplace_release(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = &ip->i_alloc;
+
+	if (gfs2_assert_warn(sdp, al->al_alloced <= al->al_requested) == -1)
+		fs_warn(sdp, "al_alloced = %u, al_requested = %u "
+			     "al_file = %s, al_line = %u\n",
+		             al->al_alloced, al->al_requested, al->al_file,
+			     al->al_line);
+
+	al->al_rgd = NULL;
+	gfs2_glock_dq_uninit(&al->al_rgd_gh);
+	gfs2_glock_dq_uninit(&al->al_ri_gh);
+}
+
+/**
+ * gfs2_get_block_type - Check a block in a RG is of given type
+ * @rgd: the resource group holding the block
+ * @block: the block number
+ *
+ * Returns: The block type (GFS2_BLKST_*)
+ */
+
+unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
+{
+	struct gfs2_bitmap *bi = NULL;
+	u32 length, rgrp_block, buf_block;
+	unsigned int buf;
+	unsigned char type;
+
+	length = rgd->rd_ri.ri_length;
+	rgrp_block = block - rgd->rd_ri.ri_data0;
+
+	for (buf = 0; buf < length; buf++) {
+		bi = rgd->rd_bits + buf;
+		if (rgrp_block < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+			break;
+	}
+
+	gfs2_assert(rgd->rd_sbd, buf < length);
+	buf_block = rgrp_block - bi->bi_start * GFS2_NBBY;
+
+	type = gfs2_testbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+			   bi->bi_len, buf_block);
+
+	return type;
+}
+
+/**
+ * rgblk_search - find a block in @old_state, change allocation
+ *           state to @new_state
+ * @rgd: the resource group descriptor
+ * @goal: the goal block within the RG (start here to search for avail block)
+ * @old_state: GFS2_BLKST_XXX the before-allocation state to find
+ * @new_state: GFS2_BLKST_XXX the after-allocation block state
+ *
+ * Walk rgrp's bitmap to find bits that represent a block in @old_state.
+ * Add the found bitmap buffer to the transaction.
+ * Set the found bits to @new_state to change block's allocation state.
+ *
+ * This function never fails, because we wouldn't call it unless we
+ * know (from reservation results, etc.) that a block is available.
+ *
+ * Scope of @goal and returned block is just within rgrp, not the whole
+ * filesystem.
+ *
+ * Returns:  the block number allocated
+ */
+
+static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
+			     unsigned char old_state, unsigned char new_state)
+{
+	struct gfs2_bitmap *bi = NULL;
+	u32 length = rgd->rd_ri.ri_length;
+	u32 blk = 0;
+	unsigned int buf, x;
+
+	/* Find bitmap block that contains bits for goal block */
+	for (buf = 0; buf < length; buf++) {
+		bi = rgd->rd_bits + buf;
+		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+			break;
+	}
+
+	gfs2_assert(rgd->rd_sbd, buf < length);
+
+	/* Convert scope of "goal" from rgrp-wide to within found bit block */
+	goal -= bi->bi_start * GFS2_NBBY;
+
+	/* Search (up to entire) bitmap in this rgrp for allocatable block.
+	   "x <= length", instead of "x < length", because we typically start
+	   the search in the middle of a bit block, but if we can't find an
+	   allocatable block anywhere else, we want to be able wrap around and
+	   search in the first part of our first-searched bit block.  */
+	for (x = 0; x <= length; x++) {
+		if (bi->bi_clone)
+			blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
+					  bi->bi_len, goal, old_state);
+		else
+			blk = gfs2_bitfit(rgd,
+					  bi->bi_bh->b_data + bi->bi_offset,
+					  bi->bi_len, goal, old_state);
+		if (blk != BFITNOENT)
+			break;
+
+		/* Try next bitmap block (wrap back to rgrp header if at end) */
+		buf = (buf + 1) % length;
+		bi = rgd->rd_bits + buf;
+		goal = 0;
+	}
+
+	if (gfs2_assert_withdraw(rgd->rd_sbd, x <= length))
+		blk = 0;
+
+	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+	gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+		    bi->bi_len, blk, new_state);
+	if (bi->bi_clone)
+		gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset,
+			    bi->bi_len, blk, new_state);
+
+	return bi->bi_start * GFS2_NBBY + blk;
+}
+
+/**
+ * rgblk_free - Change alloc state of given block(s)
+ * @sdp: the filesystem
+ * @bstart: the start of a run of blocks to free
+ * @blen: the length of the block run (all must lie within ONE RG!)
+ * @new_state: GFS2_BLKST_XXX the after-allocation block state
+ *
+ * Returns:  Resource group containing the block(s)
+ */
+
+static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
+				     u32 blen, unsigned char new_state)
+{
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_bitmap *bi = NULL;
+	u32 length, rgrp_blk, buf_blk;
+	unsigned int buf;
+
+	rgd = gfs2_blk2rgrpd(sdp, bstart);
+	if (!rgd) {
+		if (gfs2_consist(sdp))
+			fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
+		return NULL;
+	}
+
+	length = rgd->rd_ri.ri_length;
+
+	rgrp_blk = bstart - rgd->rd_ri.ri_data0;
+
+	while (blen--) {
+		for (buf = 0; buf < length; buf++) {
+			bi = rgd->rd_bits + buf;
+			if (rgrp_blk < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
+				break;
+		}
+
+		gfs2_assert(rgd->rd_sbd, buf < length);
+
+		buf_blk = rgrp_blk - bi->bi_start * GFS2_NBBY;
+		rgrp_blk++;
+
+		if (!bi->bi_clone) {
+			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+					       GFP_NOFS | __GFP_NOFAIL);
+			memcpy(bi->bi_clone + bi->bi_offset,
+			       bi->bi_bh->b_data + bi->bi_offset,
+			       bi->bi_len);
+		}
+		gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+		gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
+			    bi->bi_len, buf_blk, new_state);
+	}
+
+	return rgd;
+}
+
+/**
+ * gfs2_alloc_data - Allocate a data block
+ * @ip: the inode to allocate the data block for
+ *
+ * Returns: the allocated block
+ */
+
+u64 gfs2_alloc_data(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_rgrpd *rgd = al->al_rgd;
+	u32 goal, blk;
+	u64 block;
+
+	if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_data))
+		goal = ip->i_di.di_goal_data - rgd->rd_ri.ri_data0;
+	else
+		goal = rgd->rd_last_alloc_data;
+
+	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+	rgd->rd_last_alloc_data = blk;
+
+	block = rgd->rd_ri.ri_data0 + blk;
+	ip->i_di.di_goal_data = block;
+
+	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+	rgd->rd_rg.rg_free--;
+
+	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+	al->al_alloced++;
+
+	gfs2_statfs_change(sdp, 0, -1, 0);
+	gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
+
+	spin_lock(&sdp->sd_rindex_spin);
+	rgd->rd_free_clone--;
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	return block;
+}
+
+/**
+ * gfs2_alloc_meta - Allocate a metadata block
+ * @ip: the inode to allocate the metadata block for
+ *
+ * Returns: the allocated block
+ */
+
+u64 gfs2_alloc_meta(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_alloc *al = &ip->i_alloc;
+	struct gfs2_rgrpd *rgd = al->al_rgd;
+	u32 goal, blk;
+	u64 block;
+
+	if (rgrp_contains_block(&rgd->rd_ri, ip->i_di.di_goal_meta))
+		goal = ip->i_di.di_goal_meta - rgd->rd_ri.ri_data0;
+	else
+		goal = rgd->rd_last_alloc_meta;
+
+	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
+	rgd->rd_last_alloc_meta = blk;
+
+	block = rgd->rd_ri.ri_data0 + blk;
+	ip->i_di.di_goal_meta = block;
+
+	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+	rgd->rd_rg.rg_free--;
+
+	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+	al->al_alloced++;
+
+	gfs2_statfs_change(sdp, 0, -1, 0);
+	gfs2_quota_change(ip, +1, ip->i_di.di_uid, ip->i_di.di_gid);
+	gfs2_trans_add_unrevoke(sdp, block);
+
+	spin_lock(&sdp->sd_rindex_spin);
+	rgd->rd_free_clone--;
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	return block;
+}
+
+/**
+ * gfs2_alloc_di - Allocate a dinode
+ * @dip: the directory that the inode is going in
+ *
+ * Returns: the block allocated
+ */
+
+u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_alloc *al = &dip->i_alloc;
+	struct gfs2_rgrpd *rgd = al->al_rgd;
+	u32 blk;
+	u64 block;
+
+	blk = rgblk_search(rgd, rgd->rd_last_alloc_meta,
+			   GFS2_BLKST_FREE, GFS2_BLKST_DINODE);
+
+	rgd->rd_last_alloc_meta = blk;
+
+	block = rgd->rd_ri.ri_data0 + blk;
+
+	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
+	rgd->rd_rg.rg_free--;
+	rgd->rd_rg.rg_dinodes++;
+	*generation = rgd->rd_rg.rg_igeneration++;
+	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+	al->al_alloced++;
+
+	gfs2_statfs_change(sdp, 0, -1, +1);
+	gfs2_trans_add_unrevoke(sdp, block);
+
+	spin_lock(&sdp->sd_rindex_spin);
+	rgd->rd_free_clone--;
+	spin_unlock(&sdp->sd_rindex_spin);
+
+	return block;
+}
+
+/**
+ * gfs2_free_data - free a contiguous run of data block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+
+void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+
+	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
+	if (!rgd)
+		return;
+
+	rgd->rd_rg.rg_free += blen;
+
+	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+	gfs2_trans_add_rg(rgd);
+
+	gfs2_statfs_change(sdp, 0, +blen, 0);
+	gfs2_quota_change(ip, -(s64)blen,
+			 ip->i_di.di_uid, ip->i_di.di_gid);
+}
+
+/**
+ * gfs2_free_meta - free a contiguous run of data block(s)
+ * @ip: the inode these blocks are being freed from
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+
+void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+
+	rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
+	if (!rgd)
+		return;
+
+	rgd->rd_rg.rg_free += blen;
+
+	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+	gfs2_trans_add_rg(rgd);
+
+	gfs2_statfs_change(sdp, 0, +blen, 0);
+	gfs2_quota_change(ip, -(s64)blen, ip->i_di.di_uid, ip->i_di.di_gid);
+	gfs2_meta_wipe(ip, bstart, blen);
+}
+
+void gfs2_unlink_di(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_rgrpd *rgd;
+	u64 blkno = ip->i_num.no_addr;
+
+	rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
+	if (!rgd)
+		return;
+	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+	gfs2_trans_add_rg(rgd);
+}
+
+static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_rgrpd *tmp_rgd;
+
+	tmp_rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_FREE);
+	if (!tmp_rgd)
+		return;
+	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
+
+	if (!rgd->rd_rg.rg_dinodes)
+		gfs2_consist_rgrpd(rgd);
+	rgd->rd_rg.rg_dinodes--;
+	rgd->rd_rg.rg_free++;
+
+	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
+	gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
+
+	gfs2_statfs_change(sdp, 0, +1, -1);
+	gfs2_trans_add_rg(rgd);
+}
+
+
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+	gfs2_free_uninit_di(rgd, ip->i_num.no_addr);
+	gfs2_quota_change(ip, -1, ip->i_di.di_uid, ip->i_di.di_gid);
+	gfs2_meta_wipe(ip, ip->i_num.no_addr, 1);
+}
+
+/**
+ * gfs2_rlist_add - add a RG to a list of RGs
+ * @sdp: the filesystem
+ * @rlist: the list of resource groups
+ * @block: the block
+ *
+ * Figure out what RG a block belongs to and add that RG to the list
+ *
+ * FIXME: Don't use NOFAIL
+ *
+ */
+
+void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+		    u64 block)
+{
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_rgrpd **tmp;
+	unsigned int new_space;
+	unsigned int x;
+
+	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
+		return;
+
+	rgd = gfs2_blk2rgrpd(sdp, block);
+	if (!rgd) {
+		if (gfs2_consist(sdp))
+			fs_err(sdp, "block = %llu\n", (unsigned long long)block);
+		return;
+	}
+
+	for (x = 0; x < rlist->rl_rgrps; x++)
+		if (rlist->rl_rgd[x] == rgd)
+			return;
+
+	if (rlist->rl_rgrps == rlist->rl_space) {
+		new_space = rlist->rl_space + 10;
+
+		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
+			      GFP_NOFS | __GFP_NOFAIL);
+
+		if (rlist->rl_rgd) {
+			memcpy(tmp, rlist->rl_rgd,
+			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
+			kfree(rlist->rl_rgd);
+		}
+
+		rlist->rl_space = new_space;
+		rlist->rl_rgd = tmp;
+	}
+
+	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
+}
+
+/**
+ * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
+ *      and initialize an array of glock holders for them
+ * @rlist: the list of resource groups
+ * @state: the lock state to acquire the RG lock in
+ * @flags: the modifier flags for the holder structures
+ *
+ * FIXME: Don't use NOFAIL
+ *
+ */
+
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
+		      int flags)
+{
+	unsigned int x;
+
+	rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder),
+				GFP_NOFS | __GFP_NOFAIL);
+	for (x = 0; x < rlist->rl_rgrps; x++)
+		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
+				state, flags,
+				&rlist->rl_ghs[x]);
+}
+
+/**
+ * gfs2_rlist_free - free a resource group list
+ * @list: the list of resource groups
+ *
+ */
+
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
+{
+	unsigned int x;
+
+	kfree(rlist->rl_rgd);
+
+	if (rlist->rl_ghs) {
+		for (x = 0; x < rlist->rl_rgrps; x++)
+			gfs2_holder_uninit(&rlist->rl_ghs[x]);
+		kfree(rlist->rl_ghs);
+	}
+}
+
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
new file mode 100644
index 000000000000..b01e0cfc99b5
--- /dev/null
+++ b/fs/gfs2/rgrp.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __RGRP_DOT_H__
+#define __RGRP_DOT_H__
+
+struct gfs2_rgrpd;
+struct gfs2_sbd;
+struct gfs2_holder;
+
+void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+
+struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
+struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
+struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
+
+void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
+
+int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
+void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
+
+void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
+
+struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+static inline void gfs2_alloc_put(struct gfs2_inode *ip)
+{
+	return; /* So we can see where ip->i_alloc is used */
+}
+
+int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
+			 char *file, unsigned int line);
+#define gfs2_inplace_reserve(ip) \
+gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
+
+void gfs2_inplace_release(struct gfs2_inode *ip);
+
+unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
+
+u64 gfs2_alloc_data(struct gfs2_inode *ip);
+u64 gfs2_alloc_meta(struct gfs2_inode *ip);
+u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+
+void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
+void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+void gfs2_unlink_di(struct inode *inode);
+
+struct gfs2_rgrp_list {
+	unsigned int rl_rgrps;
+	unsigned int rl_space;
+	struct gfs2_rgrpd **rl_rgd;
+	struct gfs2_holder *rl_ghs;
+};
+
+void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+		    u64 block);
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
+		      int flags);
+void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+
+#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
new file mode 100644
index 000000000000..6a78b1b32e25
--- /dev/null
+++ b/fs/gfs2/super.c
@@ -0,0 +1,976 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/crc32.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/bio.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "dir.h"
+#include "glock.h"
+#include "glops.h"
+#include "inode.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "recovery.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "util.h"
+
+static const u32 gfs2_old_fs_formats[] = {
+        0
+};
+
+static const u32 gfs2_old_multihost_formats[] = {
+        0
+};
+
+/**
+ * gfs2_tune_init - Fill a gfs2_tune structure with default values
+ * @gt: tune
+ *
+ */
+
+void gfs2_tune_init(struct gfs2_tune *gt)
+{
+	spin_lock_init(&gt->gt_spin);
+
+	gt->gt_ilimit = 100;
+	gt->gt_ilimit_tries = 3;
+	gt->gt_ilimit_min = 1;
+	gt->gt_demote_secs = 300;
+	gt->gt_incore_log_blocks = 1024;
+	gt->gt_log_flush_secs = 60;
+	gt->gt_jindex_refresh_secs = 60;
+	gt->gt_scand_secs = 15;
+	gt->gt_recoverd_secs = 60;
+	gt->gt_logd_secs = 1;
+	gt->gt_quotad_secs = 5;
+	gt->gt_quota_simul_sync = 64;
+	gt->gt_quota_warn_period = 10;
+	gt->gt_quota_scale_num = 1;
+	gt->gt_quota_scale_den = 1;
+	gt->gt_quota_cache_secs = 300;
+	gt->gt_quota_quantum = 60;
+	gt->gt_atime_quantum = 3600;
+	gt->gt_new_files_jdata = 0;
+	gt->gt_new_files_directio = 0;
+	gt->gt_max_atomic_write = 4 << 20;
+	gt->gt_max_readahead = 1 << 18;
+	gt->gt_lockdump_size = 131072;
+	gt->gt_stall_secs = 600;
+	gt->gt_complain_secs = 10;
+	gt->gt_reclaim_limit = 5000;
+	gt->gt_entries_per_readdir = 32;
+	gt->gt_prefetch_secs = 10;
+	gt->gt_greedy_default = HZ / 10;
+	gt->gt_greedy_quantum = HZ / 40;
+	gt->gt_greedy_max = HZ / 4;
+	gt->gt_statfs_quantum = 30;
+	gt->gt_statfs_slow = 0;
+}
+
+/**
+ * gfs2_check_sb - Check superblock
+ * @sdp: the filesystem
+ * @sb: The superblock
+ * @silent: Don't print a message if the check fails
+ *
+ * Checks the version code of the FS is one that we understand how to
+ * read and that the sizes of the various on-disk structures have not
+ * changed.
+ */
+
+int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent)
+{
+	unsigned int x;
+
+	if (sb->sb_header.mh_magic != GFS2_MAGIC ||
+	    sb->sb_header.mh_type != GFS2_METATYPE_SB) {
+		if (!silent)
+			printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
+		return -EINVAL;
+	}
+
+	/*  If format numbers match exactly, we're done.  */
+
+	if (sb->sb_fs_format == GFS2_FORMAT_FS &&
+	    sb->sb_multihost_format == GFS2_FORMAT_MULTI)
+		return 0;
+
+	if (sb->sb_fs_format != GFS2_FORMAT_FS) {
+		for (x = 0; gfs2_old_fs_formats[x]; x++)
+			if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
+				break;
+
+		if (!gfs2_old_fs_formats[x]) {
+			printk(KERN_WARNING
+			       "GFS2: code version (%u, %u) is incompatible "
+			       "with ondisk format (%u, %u)\n",
+			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+			       sb->sb_fs_format, sb->sb_multihost_format);
+			printk(KERN_WARNING
+			       "GFS2: I don't know how to upgrade this FS\n");
+			return -EINVAL;
+		}
+	}
+
+	if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
+		for (x = 0; gfs2_old_multihost_formats[x]; x++)
+			if (gfs2_old_multihost_formats[x] ==
+			    sb->sb_multihost_format)
+				break;
+
+		if (!gfs2_old_multihost_formats[x]) {
+			printk(KERN_WARNING
+			       "GFS2: code version (%u, %u) is incompatible "
+			       "with ondisk format (%u, %u)\n",
+			       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+			       sb->sb_fs_format, sb->sb_multihost_format);
+			printk(KERN_WARNING
+			       "GFS2: I don't know how to upgrade this FS\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!sdp->sd_args.ar_upgrade) {
+		printk(KERN_WARNING
+		       "GFS2: code version (%u, %u) is incompatible "
+		       "with ondisk format (%u, %u)\n",
+		       GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+		       sb->sb_fs_format, sb->sb_multihost_format);
+		printk(KERN_INFO
+		       "GFS2: Use the \"upgrade\" mount option to upgrade "
+		       "the FS\n");
+		printk(KERN_INFO "GFS2: See the manual for more details\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
+{
+	struct page *page = bio->bi_private;
+	if (bio->bi_size)
+		return 1;
+
+	if (!error)
+		SetPageUptodate(page);
+	else
+		printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
+	unlock_page(page);
+	return 0;
+}
+
+struct page *gfs2_read_super(struct super_block *sb, sector_t sector)
+{
+	struct page *page;
+	struct bio *bio;
+
+	page = alloc_page(GFP_KERNEL);
+	if (unlikely(!page))
+		return NULL;
+
+	ClearPageUptodate(page);
+	ClearPageDirty(page);
+	lock_page(page);
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (unlikely(!bio)) {
+		__free_page(page);
+		return NULL;
+	}
+
+	bio->bi_sector = sector;
+	bio->bi_bdev = sb->s_bdev;
+	bio_add_page(bio, page, PAGE_SIZE, 0);
+
+	bio->bi_end_io = end_bio_io_page;
+	bio->bi_private = page;
+	submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
+	wait_on_page_locked(page);
+	bio_put(bio);
+	if (!PageUptodate(page)) {
+		__free_page(page);
+		return NULL;
+	}
+	return page;
+}
+
+/**
+ * gfs2_read_sb - Read super block
+ * @sdp: The GFS2 superblock
+ * @gl: the glock for the superblock (assumed to be held)
+ * @silent: Don't print message if mount fails
+ *
+ */
+
+int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
+{
+	u32 hash_blocks, ind_blocks, leaf_blocks;
+	u32 tmp_blocks;
+	unsigned int x;
+	int error;
+	struct page *page;
+	char *sb;
+
+	page = gfs2_read_super(sdp->sd_vfs, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+	if (!page) {
+		if (!silent)
+			fs_err(sdp, "can't read superblock\n");
+		return -EIO;
+	}
+	sb = kmap(page);
+	gfs2_sb_in(&sdp->sd_sb, sb);
+	kunmap(page);
+	__free_page(page);
+
+	error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+	if (error)
+		return error;
+
+	sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+			       GFS2_BASIC_BLOCK_SHIFT;
+	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+	sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_dinode)) / sizeof(u64);
+	sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
+			  sizeof(struct gfs2_meta_header)) / sizeof(u64);
+	sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
+	sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
+	sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
+	sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
+	sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_meta_header)) /
+			        sizeof(struct gfs2_quota_change);
+
+	/* Compute maximum reservation required to add a entry to a directory */
+
+	hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+			     sdp->sd_jbsize);
+
+	ind_blocks = 0;
+	for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
+		tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
+		ind_blocks += tmp_blocks;
+	}
+
+	leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
+
+	sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
+
+	sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_dinode);
+	sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
+	for (x = 2;; x++) {
+		u64 space, d;
+		u32 m;
+
+		space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
+		d = space;
+		m = do_div(d, sdp->sd_inptrs);
+
+		if (d != sdp->sd_heightsize[x - 1] || m)
+			break;
+		sdp->sd_heightsize[x] = space;
+	}
+	sdp->sd_max_height = x;
+	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
+
+	sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
+				 sizeof(struct gfs2_dinode);
+	sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
+	for (x = 2;; x++) {
+		u64 space, d;
+		u32 m;
+
+		space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
+		d = space;
+		m = do_div(d, sdp->sd_inptrs);
+
+		if (d != sdp->sd_jheightsize[x - 1] || m)
+			break;
+		sdp->sd_jheightsize[x] = space;
+	}
+	sdp->sd_max_jheight = x;
+	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+
+	return 0;
+}
+
+/**
+ * gfs2_jindex_hold - Grab a lock on the jindex
+ * @sdp: The GFS2 superblock
+ * @ji_gh: the holder for the jindex glock
+ *
+ * This is very similar to the gfs2_rindex_hold() function, except that
+ * in general we hold the jindex lock for longer periods of time and
+ * we grab it far less frequently (in general) then the rgrp lock.
+ *
+ * Returns: errno
+ */
+
+int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
+{
+	struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
+	struct qstr name;
+	char buf[20];
+	struct gfs2_jdesc *jd;
+	int error;
+
+	name.name = buf;
+
+	mutex_lock(&sdp->sd_jindex_mutex);
+
+	for (;;) {
+		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED,
+					   GL_LOCAL_EXCL, ji_gh);
+		if (error)
+			break;
+
+		name.len = sprintf(buf, "journal%u", sdp->sd_journals);
+		name.hash = gfs2_disk_hash(name.name, name.len);
+
+		error = gfs2_dir_search(sdp->sd_jindex, &name, NULL, NULL);
+		if (error == -ENOENT) {
+			error = 0;
+			break;
+		}
+
+		gfs2_glock_dq_uninit(ji_gh);
+
+		if (error)
+			break;
+
+		error = -ENOMEM;
+		jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
+		if (!jd)
+			break;
+
+		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
+		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
+			if (!jd->jd_inode)
+				error = -ENOENT;
+			else
+				error = PTR_ERR(jd->jd_inode);
+			kfree(jd);
+			break;
+		}
+
+		spin_lock(&sdp->sd_jindex_spin);
+		jd->jd_jid = sdp->sd_journals++;
+		list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
+		spin_unlock(&sdp->sd_jindex_spin);
+	}
+
+	mutex_unlock(&sdp->sd_jindex_mutex);
+
+	return error;
+}
+
+/**
+ * gfs2_jindex_free - Clear all the journal index information
+ * @sdp: The GFS2 superblock
+ *
+ */
+
+void gfs2_jindex_free(struct gfs2_sbd *sdp)
+{
+	struct list_head list;
+	struct gfs2_jdesc *jd;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	list_add(&list, &sdp->sd_jindex_list);
+	list_del_init(&sdp->sd_jindex_list);
+	sdp->sd_journals = 0;
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	while (!list_empty(&list)) {
+		jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+		list_del(&jd->jd_list);
+		iput(jd->jd_inode);
+		kfree(jd);
+	}
+}
+
+static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
+{
+	struct gfs2_jdesc *jd;
+	int found = 0;
+
+	list_for_each_entry(jd, head, jd_list) {
+		if (jd->jd_jid == jid) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		jd = NULL;
+
+	return jd;
+}
+
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
+{
+	struct gfs2_jdesc *jd;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	return jd;
+}
+
+void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+{
+	struct gfs2_jdesc *jd;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
+	if (jd)
+		jd->jd_dirty = 1;
+	spin_unlock(&sdp->sd_jindex_spin);
+}
+
+struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
+{
+	struct gfs2_jdesc *jd;
+	int found = 0;
+
+	spin_lock(&sdp->sd_jindex_spin);
+
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (jd->jd_dirty) {
+			jd->jd_dirty = 0;
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	if (!found)
+		jd = NULL;
+
+	return jd;
+}
+
+int gfs2_jdesc_check(struct gfs2_jdesc *jd)
+{
+	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	int ar;
+	int error;
+
+	if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
+	    (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+	jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+
+	error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
+	if (!error && ar) {
+		gfs2_consist_inode(ip);
+		error = -EIO;
+	}
+
+	return error;
+}
+
+/**
+ * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
+	struct gfs2_glock *j_gl = ip->i_gl;
+	struct gfs2_holder t_gh;
+	struct gfs2_log_header head;
+	int error;
+
+	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+				   GL_LOCAL_EXCL, &t_gh);
+	if (error)
+		return error;
+
+	gfs2_meta_cache_flush(ip);
+	j_gl->gl_ops->go_inval(j_gl, DIO_METADATA | DIO_DATA);
+
+	error = gfs2_find_jhead(sdp->sd_jdesc, &head);
+	if (error)
+		goto fail;
+
+	if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+		gfs2_consist(sdp);
+		error = -EIO;
+		goto fail;
+	}
+
+	/*  Initialize some head of the log stuff  */
+	sdp->sd_log_sequence = head.lh_sequence + 1;
+	gfs2_log_pointers_init(sdp, head.lh_blkno);
+
+	error = gfs2_quota_init(sdp);
+	if (error)
+		goto fail;
+
+	set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+	gfs2_glock_dq_uninit(&t_gh);
+
+	return 0;
+
+fail:
+	t_gh.gh_flags |= GL_NOCACHE;
+	gfs2_glock_dq_uninit(&t_gh);
+
+	return error;
+}
+
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+	struct gfs2_holder t_gh;
+	int error;
+
+	gfs2_quota_sync(sdp);
+	gfs2_statfs_sync(sdp);
+
+	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
+				GL_LOCAL_EXCL | GL_NOCACHE,
+				&t_gh);
+	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return error;
+
+	gfs2_meta_syncfs(sdp);
+	gfs2_log_shutdown(sdp);
+
+	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+	if (t_gh.gh_gl)
+		gfs2_glock_dq_uninit(&t_gh);
+
+	gfs2_quota_cleanup(sdp);
+
+	return error;
+}
+
+int gfs2_statfs_init(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+	struct buffer_head *m_bh, *l_bh;
+	struct gfs2_holder gh;
+	int error;
+
+	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
+				   &gh);
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+	if (error)
+		goto out;
+
+	if (sdp->sd_args.ar_spectator) {
+		spin_lock(&sdp->sd_statfs_spin);
+		gfs2_statfs_change_in(m_sc, m_bh->b_data +
+				      sizeof(struct gfs2_dinode));
+		spin_unlock(&sdp->sd_statfs_spin);
+	} else {
+		error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+		if (error)
+			goto out_m_bh;
+
+		spin_lock(&sdp->sd_statfs_spin);
+		gfs2_statfs_change_in(m_sc, m_bh->b_data +
+				      sizeof(struct gfs2_dinode));
+		gfs2_statfs_change_in(l_sc, l_bh->b_data +
+				      sizeof(struct gfs2_dinode));
+		spin_unlock(&sdp->sd_statfs_spin);
+
+		brelse(l_bh);
+	}
+
+out_m_bh:
+	brelse(m_bh);
+out:
+	gfs2_glock_dq_uninit(&gh);
+	return 0;
+}
+
+void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+			s64 dinodes)
+{
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+	struct buffer_head *l_bh;
+	int error;
+
+	error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+	if (error)
+		return;
+
+	mutex_lock(&sdp->sd_statfs_mutex);
+	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+	mutex_unlock(&sdp->sd_statfs_mutex);
+
+	spin_lock(&sdp->sd_statfs_spin);
+	l_sc->sc_total += total;
+	l_sc->sc_free += free;
+	l_sc->sc_dinodes += dinodes;
+	gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode));
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	brelse(l_bh);
+}
+
+int gfs2_statfs_sync(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+	struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode);
+	struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+	struct gfs2_holder gh;
+	struct buffer_head *m_bh, *l_bh;
+	int error;
+
+	error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE,
+				   &gh);
+	if (error)
+		return error;
+
+	error = gfs2_meta_inode_buffer(m_ip, &m_bh);
+	if (error)
+		goto out;
+
+	spin_lock(&sdp->sd_statfs_spin);
+	gfs2_statfs_change_in(m_sc, m_bh->b_data +
+			      sizeof(struct gfs2_dinode));
+	if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) {
+		spin_unlock(&sdp->sd_statfs_spin);
+		goto out_bh;
+	}
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	error = gfs2_meta_inode_buffer(l_ip, &l_bh);
+	if (error)
+		goto out_bh;
+
+	error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
+	if (error)
+		goto out_bh2;
+
+	mutex_lock(&sdp->sd_statfs_mutex);
+	gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1);
+	mutex_unlock(&sdp->sd_statfs_mutex);
+
+	spin_lock(&sdp->sd_statfs_spin);
+	m_sc->sc_total += l_sc->sc_total;
+	m_sc->sc_free += l_sc->sc_free;
+	m_sc->sc_dinodes += l_sc->sc_dinodes;
+	memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
+	memset(l_bh->b_data + sizeof(struct gfs2_dinode),
+	       0, sizeof(struct gfs2_statfs_change));
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
+	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+
+	gfs2_trans_end(sdp);
+
+out_bh2:
+	brelse(l_bh);
+out_bh:
+	brelse(m_bh);
+out:
+	gfs2_glock_dq_uninit(&gh);
+	return error;
+}
+
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sg: the sg structure
+ *
+ * Returns: errno
+ */
+
+int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
+{
+	struct gfs2_statfs_change *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change *l_sc = &sdp->sd_statfs_local;
+
+	spin_lock(&sdp->sd_statfs_spin);
+
+	*sc = *m_sc;
+	sc->sc_total += l_sc->sc_total;
+	sc->sc_free += l_sc->sc_free;
+	sc->sc_dinodes += l_sc->sc_dinodes;
+
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	if (sc->sc_free < 0)
+		sc->sc_free = 0;
+	if (sc->sc_free > sc->sc_total)
+		sc->sc_free = sc->sc_total;
+	if (sc->sc_dinodes < 0)
+		sc->sc_dinodes = 0;
+
+	return 0;
+}
+
+/**
+ * statfs_fill - fill in the sg for a given RG
+ * @rgd: the RG
+ * @sc: the sc structure
+ *
+ * Returns: 0 on success, -ESTALE if the LVB is invalid
+ */
+
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+			    struct gfs2_statfs_change *sc)
+{
+	gfs2_rgrp_verify(rgd);
+	sc->sc_total += rgd->rd_ri.ri_data;
+	sc->sc_free += rgd->rd_rg.rg_free;
+	sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
+	return 0;
+}
+
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+
+int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc)
+{
+	struct gfs2_holder ri_gh;
+	struct gfs2_rgrpd *rgd_next;
+	struct gfs2_holder *gha, *gh;
+	unsigned int slots = 64;
+	unsigned int x;
+	int done;
+	int error = 0, err;
+
+	memset(sc, 0, sizeof(struct gfs2_statfs_change));
+	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+	if (!gha)
+		return -ENOMEM;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto out;
+
+	rgd_next = gfs2_rgrpd_get_first(sdp);
+
+	for (;;) {
+		done = 1;
+
+		for (x = 0; x < slots; x++) {
+			gh = gha + x;
+
+			if (gh->gh_gl && gfs2_glock_poll(gh)) {
+				err = gfs2_glock_wait(gh);
+				if (err) {
+					gfs2_holder_uninit(gh);
+					error = err;
+				} else {
+					if (!error)
+						error = statfs_slow_fill(
+							gh->gh_gl->gl_object, sc);
+					gfs2_glock_dq_uninit(gh);
+				}
+			}
+
+			if (gh->gh_gl)
+				done = 0;
+			else if (rgd_next && !error) {
+				error = gfs2_glock_nq_init(rgd_next->rd_gl,
+							   LM_ST_SHARED,
+							   GL_ASYNC,
+							   gh);
+				rgd_next = gfs2_rgrpd_get_next(rgd_next);
+				done = 0;
+			}
+
+			if (signal_pending(current))
+				error = -ERESTARTSYS;
+		}
+
+		if (done)
+			break;
+
+		yield();
+	}
+
+	gfs2_glock_dq_uninit(&ri_gh);
+
+out:
+	kfree(gha);
+	return error;
+}
+
+struct lfcc {
+	struct list_head list;
+	struct gfs2_holder gh;
+};
+
+/**
+ * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all
+ *                            journals are clean
+ * @sdp: the file system
+ * @state: the state to put the transaction lock into
+ * @t_gh: the hold on the transaction lock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
+				    struct gfs2_holder *t_gh)
+{
+	struct gfs2_inode *ip;
+	struct gfs2_holder ji_gh;
+	struct gfs2_jdesc *jd;
+	struct lfcc *lfcc;
+	LIST_HEAD(list);
+	struct gfs2_log_header lh;
+	int error;
+
+	error = gfs2_jindex_hold(sdp, &ji_gh);
+	if (error)
+		return error;
+
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
+		if (!lfcc) {
+			error = -ENOMEM;
+			goto out;
+		}
+		ip = GFS2_I(jd->jd_inode);
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh);
+		if (error) {
+			kfree(lfcc);
+			goto out;
+		}
+		list_add(&lfcc->list, &list);
+	}
+
+	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
+			       LM_FLAG_PRIORITY | GL_NOCACHE,
+			       t_gh);
+
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		error = gfs2_jdesc_check(jd);
+		if (error)
+			break;
+		error = gfs2_find_jhead(jd, &lh);
+		if (error)
+			break;
+		if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
+			error = -EBUSY;
+			break;
+		}
+	}
+
+	if (error)
+		gfs2_glock_dq_uninit(t_gh);
+
+out:
+	while (!list_empty(&list)) {
+		lfcc = list_entry(list.next, struct lfcc, list);
+		list_del(&lfcc->list);
+		gfs2_glock_dq_uninit(&lfcc->gh);
+		kfree(lfcc);
+	}
+	gfs2_glock_dq_uninit(&ji_gh);
+	return error;
+}
+
+/**
+ * gfs2_freeze_fs - freezes the file system
+ * @sdp: the file system
+ *
+ * This function flushes data and meta data for all machines by
+ * aquiring the transaction log exclusively.  All journals are
+ * ensured to be in a clean state as well.
+ *
+ * Returns: errno
+ */
+
+int gfs2_freeze_fs(struct gfs2_sbd *sdp)
+{
+	int error = 0;
+
+	mutex_lock(&sdp->sd_freeze_lock);
+
+	if (!sdp->sd_freeze_count++) {
+		error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh);
+		if (error)
+			sdp->sd_freeze_count--;
+	}
+
+	mutex_unlock(&sdp->sd_freeze_lock);
+
+	return error;
+}
+
+/**
+ * gfs2_unfreeze_fs - unfreezes the file system
+ * @sdp: the file system
+ *
+ * This function allows the file system to proceed by unlocking
+ * the exclusively held transaction lock.  Other GFS2 nodes are
+ * now free to acquire the lock shared and go on with their lives.
+ *
+ */
+
+void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
+{
+	mutex_lock(&sdp->sd_freeze_lock);
+
+	if (sdp->sd_freeze_count && !--sdp->sd_freeze_count)
+		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+
+	mutex_unlock(&sdp->sd_freeze_lock);
+}
+
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
new file mode 100644
index 000000000000..5bb443ae0f59
--- /dev/null
+++ b/fs/gfs2/super.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __SUPER_DOT_H__
+#define __SUPER_DOT_H__
+
+#include "incore.h"
+
+void gfs2_tune_init(struct gfs2_tune *gt);
+
+int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb *sb, int silent);
+int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
+struct page *gfs2_read_super(struct super_block *sb, sector_t sector);
+
+static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
+{
+	unsigned int x;
+	spin_lock(&sdp->sd_jindex_spin);
+	x = sdp->sd_journals;
+	spin_unlock(&sdp->sd_jindex_spin);
+	return x;
+}
+
+int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
+void gfs2_jindex_free(struct gfs2_sbd *sdp);
+
+struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
+void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
+struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
+int gfs2_jdesc_check(struct gfs2_jdesc *jd);
+
+int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
+			      struct gfs2_inode **ipp);
+
+int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
+
+int gfs2_statfs_init(struct gfs2_sbd *sdp);
+void gfs2_statfs_change(struct gfs2_sbd *sdp,
+			s64 total, s64 free, s64 dinodes);
+int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
+int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change *sc);
+
+int gfs2_freeze_fs(struct gfs2_sbd *sdp);
+void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
+
+#endif /* __SUPER_DOT_H__ */
+
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
new file mode 100644
index 000000000000..0e0ec988f731
--- /dev/null
+++ b/fs/gfs2/sys.c
@@ -0,0 +1,583 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "lm.h"
+#include "sys.h"
+#include "super.h"
+#include "glock.h"
+#include "quota.h"
+#include "util.h"
+
+char *gfs2_sys_margs;
+spinlock_t gfs2_sys_margs_lock;
+
+static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_vfs->s_id);
+}
+
+static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
+}
+
+static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
+{
+	unsigned int count;
+
+	mutex_lock(&sdp->sd_freeze_lock);
+	count = sdp->sd_freeze_count;
+	mutex_unlock(&sdp->sd_freeze_lock);
+
+	return snprintf(buf, PAGE_SIZE, "%u\n", count);
+}
+
+static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	ssize_t ret = len;
+	int error = 0;
+	int n = simple_strtol(buf, NULL, 0);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	switch (n) {
+	case 0:
+		gfs2_unfreeze_fs(sdp);
+		break;
+	case 1:
+		error = gfs2_freeze_fs(sdp);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (error)
+		fs_warn(sdp, "freeze %d error %d", n, error);
+
+	return ret;
+}
+
+static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
+{
+	unsigned int b = test_bit(SDF_SHUTDOWN, &sdp->sd_flags);
+	return snprintf(buf, PAGE_SIZE, "%u\n", b);
+}
+
+static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (simple_strtol(buf, NULL, 0) != 1)
+		return -EINVAL;
+
+	gfs2_lm_withdraw(sdp,
+		"GFS2: fsid=%s: withdrawing from cluster at user's request\n",
+		sdp->sd_fsname);
+	return len;
+}
+
+static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
+				 size_t len)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (simple_strtol(buf, NULL, 0) != 1)
+		return -EINVAL;
+
+	gfs2_statfs_sync(sdp);
+	return len;
+}
+
+static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (simple_strtol(buf, NULL, 0) != 1)
+		return -EINVAL;
+
+	gfs2_gl_hash_clear(sdp, NO_WAIT);
+	return len;
+}
+
+static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
+				size_t len)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (simple_strtol(buf, NULL, 0) != 1)
+		return -EINVAL;
+
+	gfs2_quota_sync(sdp);
+	return len;
+}
+
+static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf,
+					size_t len)
+{
+	u32 id;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	id = simple_strtoul(buf, NULL, 0);
+
+	gfs2_quota_refresh(sdp, 1, id);
+	return len;
+}
+
+static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
+					 size_t len)
+{
+	u32 id;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	id = simple_strtoul(buf, NULL, 0);
+
+	gfs2_quota_refresh(sdp, 0, id);
+	return len;
+}
+
+struct gfs2_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+
+#define GFS2_ATTR(name, mode, show, store) \
+static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
+
+GFS2_ATTR(id,                  0444, id_show,       NULL);
+GFS2_ATTR(fsname,              0444, fsname_show,   NULL);
+GFS2_ATTR(freeze,              0644, freeze_show,   freeze_store);
+GFS2_ATTR(shrink,              0200, NULL,          shrink_store);
+GFS2_ATTR(withdraw,            0644, withdraw_show, withdraw_store);
+GFS2_ATTR(statfs_sync,         0200, NULL,          statfs_sync_store);
+GFS2_ATTR(quota_sync,          0200, NULL,          quota_sync_store);
+GFS2_ATTR(quota_refresh_user,  0200, NULL,          quota_refresh_user_store);
+GFS2_ATTR(quota_refresh_group, 0200, NULL,          quota_refresh_group_store);
+
+static struct attribute *gfs2_attrs[] = {
+	&gfs2_attr_id.attr,
+	&gfs2_attr_fsname.attr,
+	&gfs2_attr_freeze.attr,
+	&gfs2_attr_shrink.attr,
+	&gfs2_attr_withdraw.attr,
+	&gfs2_attr_statfs_sync.attr,
+	&gfs2_attr_quota_sync.attr,
+	&gfs2_attr_quota_refresh_user.attr,
+	&gfs2_attr_quota_refresh_group.attr,
+	NULL,
+};
+
+static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->show ? a->show(sdp, buf) : 0;
+}
+
+static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->store ? a->store(sdp, buf, len) : len;
+}
+
+static struct sysfs_ops gfs2_attr_ops = {
+	.show  = gfs2_attr_show,
+	.store = gfs2_attr_store,
+};
+
+static struct kobj_type gfs2_ktype = {
+	.default_attrs = gfs2_attrs,
+	.sysfs_ops     = &gfs2_attr_ops,
+};
+
+static struct kset gfs2_kset = {
+	.subsys = &fs_subsys,
+	.kobj   = {.name = "gfs2"},
+	.ktype  = &gfs2_ktype,
+};
+
+/*
+ * display struct lm_lockstruct fields
+ */
+
+struct lockstruct_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+};
+
+#define LOCKSTRUCT_ATTR(name, fmt)                                          \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
+{                                                                           \
+	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
+}                                                                           \
+static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
+
+LOCKSTRUCT_ATTR(jid,      "%u\n");
+LOCKSTRUCT_ATTR(first,    "%u\n");
+LOCKSTRUCT_ATTR(lvb_size, "%u\n");
+LOCKSTRUCT_ATTR(flags,    "%d\n");
+
+static struct attribute *lockstruct_attrs[] = {
+	&lockstruct_attr_jid.attr,
+	&lockstruct_attr_first.attr,
+	&lockstruct_attr_lvb_size.attr,
+	&lockstruct_attr_flags.attr,
+	NULL,
+};
+
+/*
+ * display struct gfs2_args fields
+ */
+
+struct args_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+};
+
+#define ARGS_ATTR(name, fmt)                                                \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
+{                                                                           \
+	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name);       \
+}                                                                           \
+static struct args_attr args_attr_##name = __ATTR_RO(name)
+
+ARGS_ATTR(lockproto,       "%s\n");
+ARGS_ATTR(locktable,       "%s\n");
+ARGS_ATTR(hostdata,        "%s\n");
+ARGS_ATTR(spectator,       "%d\n");
+ARGS_ATTR(ignore_local_fs, "%d\n");
+ARGS_ATTR(localcaching,    "%d\n");
+ARGS_ATTR(localflocks,     "%d\n");
+ARGS_ATTR(debug,           "%d\n");
+ARGS_ATTR(upgrade,         "%d\n");
+ARGS_ATTR(num_glockd,      "%u\n");
+ARGS_ATTR(posix_acl,       "%d\n");
+ARGS_ATTR(quota,           "%u\n");
+ARGS_ATTR(suiddir,         "%d\n");
+ARGS_ATTR(data,            "%d\n");
+
+/* one oddball doesn't fit the macro mold */
+static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%d\n",
+			!!test_bit(SDF_NOATIME, &sdp->sd_flags));
+}
+static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
+
+static struct attribute *args_attrs[] = {
+	&args_attr_lockproto.attr,
+	&args_attr_locktable.attr,
+	&args_attr_hostdata.attr,
+	&args_attr_spectator.attr,
+	&args_attr_ignore_local_fs.attr,
+	&args_attr_localcaching.attr,
+	&args_attr_localflocks.attr,
+	&args_attr_debug.attr,
+	&args_attr_upgrade.attr,
+	&args_attr_num_glockd.attr,
+	&args_attr_posix_acl.attr,
+	&args_attr_quota.attr,
+	&args_attr_suiddir.attr,
+	&args_attr_data.attr,
+	&args_attr_noatime.attr,
+	NULL,
+};
+
+/*
+ * display counters from superblock
+ */
+
+struct counters_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+};
+
+#define COUNTERS_ATTR(name, fmt)                                            \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
+{                                                                           \
+	return snprintf(buf, PAGE_SIZE, fmt,                                \
+			(unsigned int)atomic_read(&sdp->sd_##name));        \
+}                                                                           \
+static struct counters_attr counters_attr_##name = __ATTR_RO(name)
+
+COUNTERS_ATTR(glock_count,      "%u\n");
+COUNTERS_ATTR(glock_held_count, "%u\n");
+COUNTERS_ATTR(inode_count,      "%u\n");
+COUNTERS_ATTR(reclaimed,        "%u\n");
+
+static struct attribute *counters_attrs[] = {
+	&counters_attr_glock_count.attr,
+	&counters_attr_glock_held_count.attr,
+	&counters_attr_inode_count.attr,
+	&counters_attr_reclaimed.attr,
+	NULL,
+};
+
+/*
+ * get and set struct gfs2_tune fields
+ */
+
+static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%u %u\n",
+			sdp->sd_tune.gt_quota_scale_num,
+			sdp->sd_tune.gt_quota_scale_den);
+}
+
+static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
+				 size_t len)
+{
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	unsigned int x, y;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
+		return -EINVAL;
+
+	spin_lock(&gt->gt_spin);
+	gt->gt_quota_scale_num = x;
+	gt->gt_quota_scale_den = y;
+	spin_unlock(&gt->gt_spin);
+	return len;
+}
+
+static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
+			int check_zero, const char *buf, size_t len)
+{
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	unsigned int x;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	x = simple_strtoul(buf, NULL, 0);
+
+	if (check_zero && !x)
+		return -EINVAL;
+
+	spin_lock(&gt->gt_spin);
+	*field = x;
+	spin_unlock(&gt->gt_spin);
+	return len;
+}
+
+struct tune_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+
+#define TUNE_ATTR_3(name, show, store)                                        \
+static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+
+#define TUNE_ATTR_2(name, store)                                              \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                   \
+{                                                                             \
+	return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name);      \
+}                                                                             \
+TUNE_ATTR_3(name, name##_show, store)
+
+#define TUNE_ATTR(name, check_zero)                                           \
+static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
+{                                                                             \
+	return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len);  \
+}                                                                             \
+TUNE_ATTR_2(name, name##_store)
+
+#define TUNE_ATTR_DAEMON(name, process)                                       \
+static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
+{                                                                             \
+	ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len);      \
+	wake_up_process(sdp->sd_##process);                                   \
+	return r;                                                             \
+}                                                                             \
+TUNE_ATTR_2(name, name##_store)
+
+TUNE_ATTR(ilimit, 0);
+TUNE_ATTR(ilimit_tries, 0);
+TUNE_ATTR(ilimit_min, 0);
+TUNE_ATTR(demote_secs, 0);
+TUNE_ATTR(incore_log_blocks, 0);
+TUNE_ATTR(log_flush_secs, 0);
+TUNE_ATTR(jindex_refresh_secs, 0);
+TUNE_ATTR(quota_warn_period, 0);
+TUNE_ATTR(quota_quantum, 0);
+TUNE_ATTR(atime_quantum, 0);
+TUNE_ATTR(max_readahead, 0);
+TUNE_ATTR(complain_secs, 0);
+TUNE_ATTR(reclaim_limit, 0);
+TUNE_ATTR(prefetch_secs, 0);
+TUNE_ATTR(statfs_slow, 0);
+TUNE_ATTR(new_files_jdata, 0);
+TUNE_ATTR(new_files_directio, 0);
+TUNE_ATTR(quota_simul_sync, 1);
+TUNE_ATTR(quota_cache_secs, 1);
+TUNE_ATTR(max_atomic_write, 1);
+TUNE_ATTR(stall_secs, 1);
+TUNE_ATTR(entries_per_readdir, 1);
+TUNE_ATTR(greedy_default, 1);
+TUNE_ATTR(greedy_quantum, 1);
+TUNE_ATTR(greedy_max, 1);
+TUNE_ATTR(statfs_quantum, 1);
+TUNE_ATTR_DAEMON(scand_secs, scand_process);
+TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
+TUNE_ATTR_DAEMON(logd_secs, logd_process);
+TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
+TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
+
+static struct attribute *tune_attrs[] = {
+	&tune_attr_ilimit.attr,
+	&tune_attr_ilimit_tries.attr,
+	&tune_attr_ilimit_min.attr,
+	&tune_attr_demote_secs.attr,
+	&tune_attr_incore_log_blocks.attr,
+	&tune_attr_log_flush_secs.attr,
+	&tune_attr_jindex_refresh_secs.attr,
+	&tune_attr_quota_warn_period.attr,
+	&tune_attr_quota_quantum.attr,
+	&tune_attr_atime_quantum.attr,
+	&tune_attr_max_readahead.attr,
+	&tune_attr_complain_secs.attr,
+	&tune_attr_reclaim_limit.attr,
+	&tune_attr_prefetch_secs.attr,
+	&tune_attr_statfs_slow.attr,
+	&tune_attr_quota_simul_sync.attr,
+	&tune_attr_quota_cache_secs.attr,
+	&tune_attr_max_atomic_write.attr,
+	&tune_attr_stall_secs.attr,
+	&tune_attr_entries_per_readdir.attr,
+	&tune_attr_greedy_default.attr,
+	&tune_attr_greedy_quantum.attr,
+	&tune_attr_greedy_max.attr,
+	&tune_attr_statfs_quantum.attr,
+	&tune_attr_scand_secs.attr,
+	&tune_attr_recoverd_secs.attr,
+	&tune_attr_logd_secs.attr,
+	&tune_attr_quotad_secs.attr,
+	&tune_attr_quota_scale.attr,
+	&tune_attr_new_files_jdata.attr,
+	&tune_attr_new_files_directio.attr,
+	NULL,
+};
+
+static struct attribute_group lockstruct_group = {
+	.name = "lockstruct",
+	.attrs = lockstruct_attrs,
+};
+
+static struct attribute_group counters_group = {
+	.name = "counters",
+	.attrs = counters_attrs,
+};
+
+static struct attribute_group args_group = {
+	.name = "args",
+	.attrs = args_attrs,
+};
+
+static struct attribute_group tune_group = {
+	.name = "tune",
+	.attrs = tune_attrs,
+};
+
+int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
+{
+	int error;
+
+	sdp->sd_kobj.kset = &gfs2_kset;
+	sdp->sd_kobj.ktype = &gfs2_ktype;
+
+	error = kobject_set_name(&sdp->sd_kobj, "%s", sdp->sd_table_name);
+	if (error)
+		goto fail;
+
+	error = kobject_register(&sdp->sd_kobj);
+	if (error)
+		goto fail;
+
+	error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
+	if (error)
+		goto fail_reg;
+
+	error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
+	if (error)
+		goto fail_lockstruct;
+
+	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
+	if (error)
+		goto fail_counters;
+
+	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
+	if (error)
+		goto fail_args;
+
+	return 0;
+
+fail_args:
+	sysfs_remove_group(&sdp->sd_kobj, &args_group);
+fail_counters:
+	sysfs_remove_group(&sdp->sd_kobj, &counters_group);
+fail_lockstruct:
+	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
+fail_reg:
+	kobject_unregister(&sdp->sd_kobj);
+fail:
+	fs_err(sdp, "error %d adding sysfs files", error);
+	return error;
+}
+
+void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
+{
+	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
+	sysfs_remove_group(&sdp->sd_kobj, &args_group);
+	sysfs_remove_group(&sdp->sd_kobj, &counters_group);
+	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
+	kobject_unregister(&sdp->sd_kobj);
+}
+
+int gfs2_sys_init(void)
+{
+	gfs2_sys_margs = NULL;
+	spin_lock_init(&gfs2_sys_margs_lock);
+	return kset_register(&gfs2_kset);
+}
+
+void gfs2_sys_uninit(void)
+{
+	kfree(gfs2_sys_margs);
+	kset_unregister(&gfs2_kset);
+}
+
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
new file mode 100644
index 000000000000..1ca8cdac5304
--- /dev/null
+++ b/fs/gfs2/sys.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __SYS_DOT_H__
+#define __SYS_DOT_H__
+
+#include <linux/spinlock.h>
+struct gfs2_sbd;
+
+/* Allow args to be passed to GFS2 when using an initial ram disk */
+extern char *gfs2_sys_margs;
+extern spinlock_t gfs2_sys_margs_lock;
+
+int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
+void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
+
+int gfs2_sys_init(void);
+void gfs2_sys_uninit(void);
+
+#endif /* __SYS_DOT_H__ */
+
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
new file mode 100644
index 000000000000..f8dabf8446bb
--- /dev/null
+++ b/fs/gfs2/trans.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/kallsyms.h>
+#include <linux/lm_interface.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "trans.h"
+#include "util.h"
+
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+		     unsigned int revokes)
+{
+	struct gfs2_trans *tr;
+	int error;
+
+	BUG_ON(current->journal_info);
+	BUG_ON(blocks == 0 && revokes == 0);
+
+	tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
+	if (!tr)
+		return -ENOMEM;
+
+	tr->tr_ip = (unsigned long)__builtin_return_address(0);
+	tr->tr_blocks = blocks;
+	tr->tr_revokes = revokes;
+	tr->tr_reserved = 1;
+	if (blocks)
+		tr->tr_reserved += 6 + blocks;
+	if (revokes)
+		tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
+						   sizeof(u64));
+	INIT_LIST_HEAD(&tr->tr_list_buf);
+
+	gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
+
+	error = gfs2_glock_nq(&tr->tr_t_gh);
+	if (error)
+		goto fail_holder_uninit;
+
+	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+		tr->tr_t_gh.gh_flags |= GL_NOCACHE;
+		error = -EROFS;
+		goto fail_gunlock;
+	}
+
+	error = gfs2_log_reserve(sdp, tr->tr_reserved);
+	if (error)
+		goto fail_gunlock;
+
+	current->journal_info = tr;
+
+	return 0;
+
+fail_gunlock:
+	gfs2_glock_dq(&tr->tr_t_gh);
+
+fail_holder_uninit:
+	gfs2_holder_uninit(&tr->tr_t_gh);
+	kfree(tr);
+
+	return error;
+}
+
+void gfs2_trans_end(struct gfs2_sbd *sdp)
+{
+	struct gfs2_trans *tr = current->journal_info;
+
+	BUG_ON(!tr);
+	current->journal_info = NULL;
+
+	if (!tr->tr_touched) {
+		gfs2_log_release(sdp, tr->tr_reserved);
+		gfs2_glock_dq(&tr->tr_t_gh);
+		gfs2_holder_uninit(&tr->tr_t_gh);
+		kfree(tr);
+		return;
+	}
+
+	if (gfs2_assert_withdraw(sdp, tr->tr_num_buf <= tr->tr_blocks)) {
+		fs_err(sdp, "tr_num_buf = %u, tr_blocks = %u ",
+		       tr->tr_num_buf, tr->tr_blocks);
+		print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
+	}
+	if (gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) {
+		fs_err(sdp, "tr_num_revoke = %u, tr_revokes = %u ",
+		       tr->tr_num_revoke, tr->tr_revokes);
+		print_symbol(KERN_WARNING "GFS2: Transaction created at: %s\n", tr->tr_ip);
+	}
+
+	gfs2_log_commit(sdp, tr);
+        gfs2_glock_dq(&tr->tr_t_gh);
+        gfs2_holder_uninit(&tr->tr_t_gh);
+        kfree(tr);
+
+	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
+		gfs2_log_flush(sdp, NULL);
+}
+
+void gfs2_trans_add_gl(struct gfs2_glock *gl)
+{
+	lops_add(gl->gl_sbd, &gl->gl_le);
+}
+
+/**
+ * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction
+ * @gl: the glock the buffer belongs to
+ * @bh: The buffer to add
+ * @meta: True in the case of adding metadata
+ *
+ */
+
+void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
+{
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct gfs2_bufdata *bd;
+
+	bd = bh->b_private;
+	if (bd)
+		gfs2_assert(sdp, bd->bd_gl == gl);
+	else {
+		gfs2_attach_bufdata(gl, bh, meta);
+		bd = bh->b_private;
+	}
+	lops_add(sdp, &bd->bd_le);
+}
+
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno)
+{
+	struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
+					 GFP_NOFS | __GFP_NOFAIL);
+	lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
+	rv->rv_blkno = blkno;
+	lops_add(sdp, &rv->rv_le);
+}
+
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
+{
+	struct gfs2_revoke *rv;
+	int found = 0;
+
+	gfs2_log_lock(sdp);
+
+	list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
+		if (rv->rv_blkno == blkno) {
+			list_del(&rv->rv_le.le_list);
+			gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
+			sdp->sd_log_num_revoke--;
+			found = 1;
+			break;
+		}
+	}
+
+	gfs2_log_unlock(sdp);
+
+	if (found) {
+		struct gfs2_trans *tr = current->journal_info;
+		kfree(rv);
+		tr->tr_num_revoke_rm++;
+	}
+}
+
+void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
+{
+	lops_add(rgd->rd_sbd, &rgd->rd_le);
+}
+
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
new file mode 100644
index 000000000000..23d4cbe1de5b
--- /dev/null
+++ b/fs/gfs2/trans.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __TRANS_DOT_H__
+#define __TRANS_DOT_H__
+
+#include <linux/buffer_head.h>
+struct gfs2_sbd;
+struct gfs2_rgrpd;
+struct gfs2_glock;
+
+#define RES_DINODE	1
+#define RES_INDIRECT	1
+#define RES_JDATA	1
+#define RES_DATA	1
+#define RES_LEAF	1
+#define RES_RG_BIT	2
+#define RES_EATTR	1
+#define RES_STATFS	1
+#define RES_QUOTA	2
+
+int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
+		     unsigned int revokes);
+
+void gfs2_trans_end(struct gfs2_sbd *sdp);
+
+void gfs2_trans_add_gl(struct gfs2_glock *gl);
+void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
+
+#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
new file mode 100644
index 000000000000..196c604faadc
--- /dev/null
+++ b/fs/gfs2/util.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/crc32.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/lm_interface.h>
+#include <asm/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "glock.h"
+#include "lm.h"
+#include "util.h"
+
+kmem_cache_t *gfs2_glock_cachep __read_mostly;
+kmem_cache_t *gfs2_inode_cachep __read_mostly;
+kmem_cache_t *gfs2_bufdata_cachep __read_mostly;
+
+void gfs2_assert_i(struct gfs2_sbd *sdp)
+{
+	printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
+	       sdp->sd_fsname);
+}
+
+/**
+ * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+
+int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+			   const char *function, char *file, unsigned int line)
+{
+	int me;
+	me = gfs2_lm_withdraw(sdp,
+		"GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
+		"GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		sdp->sd_fsname, assertion,
+		sdp->sd_fsname, function, file, line);
+	dump_stack();
+	return (me) ? -1 : -2;
+}
+
+/**
+ * gfs2_assert_warn_i - Print a message to the console if @assertion is false
+ * Returns: -1 if we printed something
+ *          -2 if we didn't
+ */
+
+int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+		       const char *function, char *file, unsigned int line)
+{
+	if (time_before(jiffies,
+			sdp->sd_last_warning +
+			gfs2_tune_get(sdp, gt_complain_secs) * HZ))
+		return -2;
+
+	printk(KERN_WARNING
+	       "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+	       "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+	       sdp->sd_fsname, assertion,
+	       sdp->sd_fsname, function, file, line);
+
+	if (sdp->sd_args.ar_debug)
+		BUG();
+	else
+		dump_stack();
+
+	sdp->sd_last_warning = jiffies;
+
+	return -1;
+}
+
+/**
+ * gfs2_consist_i - Flag a filesystem consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
+		   char *file, unsigned int line)
+{
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+		"GFS2: fsid=%s: fatal: filesystem consistency error\n"
+		"GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		sdp->sd_fsname,
+		sdp->sd_fsname, function, file, line);
+	return rv;
+}
+
+/**
+ * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
+			 const char *function, char *file, unsigned int line)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+		"GFS2: fsid=%s: fatal: filesystem consistency error\n"
+		"GFS2: fsid=%s:   inode = %llu %llu\n"
+		"GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		sdp->sd_fsname,
+		sdp->sd_fsname, (unsigned long long)ip->i_num.no_formal_ino,
+		(unsigned long long)ip->i_num.no_addr,
+		sdp->sd_fsname, function, file, line);
+	return rv;
+}
+
+/**
+ * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
+			 const char *function, char *file, unsigned int line)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+		"GFS2: fsid=%s: fatal: filesystem consistency error\n"
+		"GFS2: fsid=%s:   RG = %llu\n"
+		"GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		sdp->sd_fsname,
+		sdp->sd_fsname, (unsigned long long)rgd->rd_ri.ri_addr,
+		sdp->sd_fsname, function, file, line);
+	return rv;
+}
+
+/**
+ * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *type, const char *function, char *file,
+		       unsigned int line)
+{
+	int me;
+	me = gfs2_lm_withdraw(sdp,
+		"GFS2: fsid=%s: fatal: invalid metadata block\n"
+		"GFS2: fsid=%s:   bh = %llu (%s)\n"
+		"GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		sdp->sd_fsname,
+		sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
+		sdp->sd_fsname, function, file, line);
+	return (me) ? -1 : -2;
+}
+
+/**
+ * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			   u16 type, u16 t, const char *function,
+			   char *file, unsigned int line)
+{
+	int me;
+	me = gfs2_lm_withdraw(sdp,
+		"GFS2: fsid=%s: fatal: invalid metadata block\n"
+		"GFS2: fsid=%s:   bh = %llu (type: exp=%u, found=%u)\n"
+		"GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		sdp->sd_fsname,
+		sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
+		sdp->sd_fsname, function, file, line);
+	return (me) ? -1 : -2;
+}
+
+/**
+ * gfs2_io_error_i - Flag an I/O error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
+		    unsigned int line)
+{
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+		"GFS2: fsid=%s: fatal: I/O error\n"
+		"GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		sdp->sd_fsname,
+		sdp->sd_fsname, function, file, line);
+	return rv;
+}
+
+/**
+ * gfs2_io_error_bh_i - Flag a buffer I/O error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *function, char *file, unsigned int line)
+{
+	int rv;
+	rv = gfs2_lm_withdraw(sdp,
+		"GFS2: fsid=%s: fatal: I/O error\n"
+		"GFS2: fsid=%s:   block = %llu\n"
+		"GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		sdp->sd_fsname,
+		sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
+		sdp->sd_fsname, function, file, line);
+	return rv;
+}
+
+void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
+		      unsigned int bit, int new_value)
+{
+	unsigned int c, o, b = bit;
+	int old_value;
+
+	c = b / (8 * PAGE_SIZE);
+	b %= 8 * PAGE_SIZE;
+	o = b / 8;
+	b %= 8;
+
+	old_value = (bitmap[c][o] & (1 << b));
+	gfs2_assert_withdraw(sdp, !old_value != !new_value);
+
+	if (new_value)
+		bitmap[c][o] |= 1 << b;
+	else
+		bitmap[c][o] &= ~(1 << b);
+}
+
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 000000000000..76a50899fe9e
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+#include "incore.h"
+
+#define fs_printk(level, fs, fmt, arg...) \
+	printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
+
+#define fs_info(fs, fmt, arg...) \
+	fs_printk(KERN_INFO , fs , fmt , ## arg)
+
+#define fs_warn(fs, fmt, arg...) \
+	fs_printk(KERN_WARNING , fs , fmt , ## arg)
+
+#define fs_err(fs, fmt, arg...) \
+	fs_printk(KERN_ERR, fs , fmt , ## arg)
+
+
+void gfs2_assert_i(struct gfs2_sbd *sdp);
+
+#define gfs2_assert(sdp, assertion) \
+do { \
+	if (unlikely(!(assertion))) { \
+		gfs2_assert_i(sdp); \
+		BUG(); \
+        } \
+} while (0)
+
+
+int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+			   const char *function, char *file, unsigned int line);
+
+#define gfs2_assert_withdraw(sdp, assertion) \
+((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \
+					__FUNCTION__, __FILE__, __LINE__))
+
+
+int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+		       const char *function, char *file, unsigned int line);
+
+#define gfs2_assert_warn(sdp, assertion) \
+((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \
+					__FUNCTION__, __FILE__, __LINE__))
+
+
+int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide,
+		   const char *function, char *file, unsigned int line);
+
+#define gfs2_consist(sdp) \
+gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__)
+
+
+int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
+			 const char *function, char *file, unsigned int line);
+
+#define gfs2_consist_inode(ip) \
+gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__)
+
+
+int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
+			 const char *function, char *file, unsigned int line);
+
+#define gfs2_consist_rgrpd(rgd) \
+gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__)
+
+
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *type, const char *function,
+		       char *file, unsigned int line);
+
+static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp,
+				    struct buffer_head *bh,
+				    const char *function,
+				    char *file, unsigned int line)
+{
+	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+	u32 magic = mh->mh_magic;
+	magic = be32_to_cpu(magic);
+	if (unlikely(magic != GFS2_MAGIC))
+		return gfs2_meta_check_ii(sdp, bh, "magic number", function,
+					  file, line);
+	return 0;
+}
+
+#define gfs2_meta_check(sdp, bh) \
+gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__)
+
+
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			   u16 type, u16 t,
+			   const char *function,
+			   char *file, unsigned int line);
+
+static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
+					struct buffer_head *bh,
+					u16 type,
+					const char *function,
+					char *file, unsigned int line)
+{
+	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+	u32 magic = mh->mh_magic;
+	u16 t = be32_to_cpu(mh->mh_type);
+	magic = be32_to_cpu(magic);
+	if (unlikely(magic != GFS2_MAGIC))
+		return gfs2_meta_check_ii(sdp, bh, "magic number", function,
+					  file, line);
+        if (unlikely(t != type))
+		return gfs2_metatype_check_ii(sdp, bh, type, t, function,
+					      file, line);
+	return 0;
+}
+
+#define gfs2_metatype_check(sdp, bh, type) \
+gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__)
+
+static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
+				     u16 format)
+{
+	struct gfs2_meta_header *mh;
+	mh = (struct gfs2_meta_header *)bh->b_data;
+	mh->mh_type = cpu_to_be32(type);
+	mh->mh_format = cpu_to_be32(format);
+}
+
+
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
+		    char *file, unsigned int line);
+
+#define gfs2_io_error(sdp) \
+gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__);
+
+
+int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *function, char *file, unsigned int line);
+
+#define gfs2_io_error_bh(sdp, bh) \
+gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__);
+
+
+extern kmem_cache_t *gfs2_glock_cachep;
+extern kmem_cache_t *gfs2_inode_cachep;
+extern kmem_cache_t *gfs2_bufdata_cachep;
+
+static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
+					   unsigned int *p)
+{
+	unsigned int x;
+	spin_lock(&gt->gt_spin);
+	x = *p;
+	spin_unlock(&gt->gt_spin);
+	return x;
+}
+
+#define gfs2_tune_get(sdp, field) \
+gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
+
+void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
+		      unsigned int bit, int new_value);
+
+#endif /* __UTIL_DOT_H__ */
+
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 13231dd5ce66..0d200068d0af 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -249,10 +249,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	sb = tree->inode->i_sb;
 	size = sizeof(struct hfs_bnode) + tree->pages_per_bnode *
 		sizeof(struct page *);
-	node = kmalloc(size, GFP_KERNEL);
+	node = kzalloc(size, GFP_KERNEL);
 	if (!node)
 		return NULL;
-	memset(node, 0, size);
 	node->tree = tree;
 	node->this = cnid;
 	set_bit(HFS_BNODE_NEW, &node->flags);
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 400357994319..5fd0ed71f923 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -21,10 +21,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	struct page *page;
 	unsigned int size;
 
-	tree = kmalloc(sizeof(*tree), GFP_KERNEL);
+	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
 	if (!tree)
 		return NULL;
-	memset(tree, 0, sizeof(*tree));
 
 	init_MUTEX(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 7cd8cc03aea7..37d681b4f216 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -246,7 +246,7 @@ static int hfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (res)
 		return res;
 
-	inode->i_nlink--;
+	drop_nlink(inode);
 	hfs_delete_inode(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
@@ -273,7 +273,7 @@ static int hfs_rmdir(struct inode *dir, struct dentry *dentry)
 	res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
 	if (res)
 		return res;
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	hfs_delete_inode(inode);
 	mark_inode_dirty(inode);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 315cf44a90b2..02f5573e0349 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -154,7 +154,6 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
 	inode->i_gid = current->fsgid;
 	inode->i_nlink = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blksize = HFS_SB(sb)->alloc_blksz;
 	HFS_I(inode)->flags = 0;
 	HFS_I(inode)->rsrc_inode = NULL;
 	HFS_I(inode)->fs_blocks = 0;
@@ -284,7 +283,6 @@ static int hfs_read_inode(struct inode *inode, void *data)
 	inode->i_uid = hsb->s_uid;
 	inode->i_gid = hsb->s_gid;
 	inode->i_nlink = 1;
-	inode->i_blksize = HFS_SB(inode->i_sb)->alloc_blksz;
 
 	if (idata->key)
 		HFS_I(inode)->cat_key = *idata->key;
@@ -603,8 +601,10 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
 
 static const struct file_operations hfs_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.sendfile	= generic_file_sendfile,
 	.fsync		= file_fsync,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 34937ee83ab1..d43b4fcc8ad3 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -356,11 +356,10 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *root_inode;
 	int res;
 
-	sbi = kmalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 	sb->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(struct hfs_sb_info));
 	INIT_HLIST_HEAD(&sbi->rsrc_inodes);
 
 	res = -EINVAL;
@@ -455,8 +454,7 @@ static int __init init_hfs_fs(void)
 static void __exit exit_hfs_fs(void)
 {
 	unregister_filesystem(&hfs_fs_type);
-	if (kmem_cache_destroy(hfs_inode_cachep))
-		printk(KERN_ERR "hfs_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(hfs_inode_cachep);
 }
 
 module_init(init_hfs_fs)
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 77bf434da679..29da6574ba77 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -409,10 +409,9 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 	sb = tree->inode->i_sb;
 	size = sizeof(struct hfs_bnode) + tree->pages_per_bnode *
 		sizeof(struct page *);
-	node = kmalloc(size, GFP_KERNEL);
+	node = kzalloc(size, GFP_KERNEL);
 	if (!node)
 		return NULL;
-	memset(node, 0, size);
 	node->tree = tree;
 	node->this = cnid;
 	set_bit(HFS_BNODE_NEW, &node->flags);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index cfc852fdd1b5..a9b9e872e29a 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -24,10 +24,9 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	struct page *page;
 	unsigned int size;
 
-	tree = kmalloc(sizeof(*tree), GFP_KERNEL);
+	tree = kzalloc(sizeof(*tree), GFP_KERNEL);
 	if (!tree)
 		return NULL;
-	memset(tree, 0, sizeof(*tree));
 
 	init_MUTEX(&tree->tree_lock);
 	spin_lock_init(&tree->hash_lock);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 1f9ece0de326..7e309751645f 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -298,7 +298,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
 	if (res)
 		return res;
 
-	inode->i_nlink++;
+	inc_nlink(inode);
 	hfsplus_instantiate(dst_dentry, inode, cnid);
 	atomic_inc(&inode->i_count);
 	inode->i_ctime = CURRENT_TIME_SEC;
@@ -338,7 +338,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		return res;
 
 	if (inode->i_nlink > 0)
-		inode->i_nlink--;
+		drop_nlink(inode);
 	hfsplus_delete_inode(inode);
 	if (inode->i_ino != cnid && !inode->i_nlink) {
 		if (!atomic_read(&HFSPLUS_I(inode).opencnt)) {
@@ -348,7 +348,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
 		} else
 			inode->i_flags |= S_DEAD;
 	} else
-		inode->i_nlink = 0;
+		clear_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	mark_inode_dirty(inode);
 
@@ -387,7 +387,7 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
 	res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
 	if (res)
 		return res;
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	hfsplus_delete_inode(inode);
 	mark_inode_dirty(inode);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 8a1ca5ef7ada..3915635b4470 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -246,12 +246,8 @@ struct hfsplus_readdir_data {
 
 /* ext2 ioctls (EXT2_IOC_GETFLAGS and EXT2_IOC_SETFLAGS) to support
  * chattr/lsattr */
-#define HFSPLUS_IOC_EXT2_GETFLAGS	_IOR('f', 1, long)
-#define HFSPLUS_IOC_EXT2_SETFLAGS	_IOW('f', 2, long)
-
-#define EXT2_FLAG_IMMUTABLE		0x00000010 /* Immutable file */
-#define EXT2_FLAG_APPEND		0x00000020 /* writes to file may only append */
-#define EXT2_FLAG_NODUMP		0x00000040 /* do not dump file */
+#define HFSPLUS_IOC_EXT2_GETFLAGS	FS_IOC_GETFLAGS
+#define HFSPLUS_IOC_EXT2_SETFLAGS	FS_IOC_SETFLAGS
 
 
 /*
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 924ecdef8091..9e3675249633 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -282,8 +282,10 @@ static struct inode_operations hfsplus_file_inode_operations = {
 
 static const struct file_operations hfsplus_file_operations = {
 	.llseek 	= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.sendfile	= generic_file_sendfile,
 	.fsync		= file_fsync,
@@ -304,7 +306,6 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
 	inode->i_gid = current->fsgid;
 	inode->i_nlink = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blksize = HFSPLUS_SB(sb).alloc_blksz;
 	INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
 	init_MUTEX(&HFSPLUS_I(inode).extents_lock);
 	atomic_set(&HFSPLUS_I(inode).opencnt, 0);
@@ -407,7 +408,6 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 	type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
 
 	HFSPLUS_I(inode).dev = 0;
-	inode->i_blksize = HFSPLUS_SB(inode->i_sb).alloc_blksz;
 	if (type == HFSPLUS_FOLDER) {
 		struct hfsplus_cat_folder *folder = &entry.folder;
 
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 13cf848ac833..79fd10402ea3 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -28,11 +28,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 	case HFSPLUS_IOC_EXT2_GETFLAGS:
 		flags = 0;
 		if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
-			flags |= EXT2_FLAG_IMMUTABLE; /* EXT2_IMMUTABLE_FL */
+			flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
 		if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
-			flags |= EXT2_FLAG_APPEND; /* EXT2_APPEND_FL */
+			flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
 		if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
-			flags |= EXT2_FLAG_NODUMP; /* EXT2_NODUMP_FL */
+			flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
 		return put_user(flags, (int __user *)arg);
 	case HFSPLUS_IOC_EXT2_SETFLAGS: {
 		if (IS_RDONLY(inode))
@@ -44,32 +44,31 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		if (get_user(flags, (int __user *)arg))
 			return -EFAULT;
 
-		if (flags & (EXT2_FLAG_IMMUTABLE|EXT2_FLAG_APPEND) ||
+		if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
 		    HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
 			if (!capable(CAP_LINUX_IMMUTABLE))
 				return -EPERM;
 		}
 
 		/* don't silently ignore unsupported ext2 flags */
-		if (flags & ~(EXT2_FLAG_IMMUTABLE|EXT2_FLAG_APPEND|
-			      EXT2_FLAG_NODUMP))
+		if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL))
 			return -EOPNOTSUPP;
 
-		if (flags & EXT2_FLAG_IMMUTABLE) { /* EXT2_IMMUTABLE_FL */
+		if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
 			inode->i_flags |= S_IMMUTABLE;
 			HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
 		} else {
 			inode->i_flags &= ~S_IMMUTABLE;
 			HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
 		}
-		if (flags & EXT2_FLAG_APPEND) { /* EXT2_APPEND_FL */
+		if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
 			inode->i_flags |= S_APPEND;
 			HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
 		} else {
 			inode->i_flags &= ~S_APPEND;
 			HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
 		}
-		if (flags & EXT2_FLAG_NODUMP) /* EXT2_NODUMP_FL */
+		if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
 			HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
 		else
 			HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index ae783066fc3a..1528a6fd0299 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/hfs/part_tbl.c
+ * linux/fs/hfsplus/part_tbl.c
  *
  * Copyright (C) 1996-1997  Paul H. Hargrove
  * This file may be distributed under the terms of the GNU General Public License.
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index d279d5924f28..194eede52fa4 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -493,8 +493,7 @@ static int __init init_hfsplus_fs(void)
 static void __exit exit_hfsplus_fs(void)
 {
 	unregister_filesystem(&hfsplus_fs_type);
-	if (kmem_cache_destroy(hfsplus_inode_cachep))
-		printk(KERN_ERR "hfsplus_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(hfsplus_inode_cachep);
 }
 
 module_init(init_hfsplus_fs)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index b82e3d9c8790..b6bd33ca3731 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -156,7 +156,6 @@ static int read_name(struct inode *ino, char *name)
 	ino->i_mode = i_mode;
 	ino->i_nlink = i_nlink;
 	ino->i_size = i_size;
-	ino->i_blksize = i_blksize;
 	ino->i_blocks = i_blocks;
 	return(0);
 }
@@ -386,13 +385,11 @@ int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 
 static const struct file_operations hostfs_file_fops = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
 	.sendfile	= generic_file_sendfile,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
-	.write		= generic_file_write,
+	.write		= do_sync_write,
 	.mmap		= generic_file_mmap,
 	.open		= hostfs_file_open,
 	.release	= NULL,
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index 2807aa833e62..b52b7381d10f 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -76,7 +76,7 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
 		return NULL;
 	}
 
-	qbh->data = data = (char *)kmalloc(2048, GFP_NOFS);
+	qbh->data = data = kmalloc(2048, GFP_NOFS);
 	if (!data) {
 		printk("HPFS: hpfs_map_4sectors: out of memory\n");
 		goto bail;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index d9eb19b7b8ae..8b94d24855f0 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -113,7 +113,7 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf,
 {
 	ssize_t retval;
 
-	retval = generic_file_write(file, buf, count, ppos);
+	retval = do_sync_write(file, buf, count, ppos);
 	if (retval > 0)
 		hpfs_i(file->f_dentry->d_inode)->i_dirty = 1;
 	return retval;
@@ -122,8 +122,10 @@ static ssize_t hpfs_file_write(struct file *file, const char __user *buf,
 const struct file_operations hpfs_file_ops =
 {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
 	.write		= hpfs_file_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.release	= hpfs_file_release,
 	.fsync		= hpfs_file_fsync,
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 56f2c338c4d9..7faef8544f32 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -17,7 +17,6 @@ void hpfs_init_inode(struct inode *i)
 	i->i_gid = hpfs_sb(sb)->sb_gid;
 	i->i_mode = hpfs_sb(sb)->sb_mode;
 	hpfs_inode->i_conv = hpfs_sb(sb)->sb_conv;
-	i->i_blksize = 512;
 	i->i_size = -1;
 	i->i_blocks = -1;
 	
@@ -61,14 +60,14 @@ void hpfs_read_inode(struct inode *i)
 	if (hpfs_sb(i->i_sb)->sb_eas) {
 		if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) {
 			if (ea_size == 2) {
-				i->i_uid = le16_to_cpu(*(u16*)ea);
+				i->i_uid = le16_to_cpu(*(__le16*)ea);
 				hpfs_inode->i_ea_uid = 1;
 			}
 			kfree(ea);
 		}
 		if ((ea = hpfs_get_ea(i->i_sb, fnode, "GID", &ea_size))) {
 			if (ea_size == 2) {
-				i->i_gid = le16_to_cpu(*(u16*)ea);
+				i->i_gid = le16_to_cpu(*(__le16*)ea);
 				hpfs_inode->i_ea_gid = 1;
 			}
 			kfree(ea);
@@ -88,7 +87,7 @@ void hpfs_read_inode(struct inode *i)
 			int rdev = 0;
 			umode_t mode = hpfs_sb(sb)->sb_mode;
 			if (ea_size == 2) {
-				mode = le16_to_cpu(*(u16*)ea);
+				mode = le16_to_cpu(*(__le16*)ea);
 				hpfs_inode->i_ea_mode = 1;
 			}
 			kfree(ea);
@@ -96,7 +95,7 @@ void hpfs_read_inode(struct inode *i)
 			if (S_ISBLK(mode) || S_ISCHR(mode)) {
 				if ((ea = hpfs_get_ea(i->i_sb, fnode, "DEV", &ea_size))) {
 					if (ea_size == 4)
-						rdev = le32_to_cpu(*(u32*)ea);
+						rdev = le32_to_cpu(*(__le32*)ea);
 					kfree(ea);
 				}
 			}
@@ -149,7 +148,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
 		   we'd better not overwrite them
 		hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino);
 	} else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) {
-		u32 ea;
+		__le32 ea;
 		if ((i->i_uid != hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) {
 			ea = cpu_to_le32(i->i_uid);
 			hpfs_set_ea(i, fnode, "UID", (char*)&ea, 2);
@@ -166,6 +165,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode)
 			  && i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333))
 			  | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))) || hpfs_inode->i_ea_mode) {
 				ea = cpu_to_le32(i->i_mode);
+				/* sick, but legal */
 				hpfs_set_ea(i, fnode, "MODE", (char *)&ea, 2);
 				hpfs_inode->i_ea_mode = 1;
 			}
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 59e7dc182a0c..2507e7393f3c 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -89,7 +89,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	brelse(bh);
 	hpfs_mark_4buffers_dirty(&qbh0);
 	hpfs_brelse4(&qbh0);
-	dir->i_nlink++;
+	inc_nlink(dir);
 	insert_inode_hash(result);
 
 	if (result->i_uid != current->fsuid ||
@@ -434,7 +434,7 @@ again:
 		unlock_kernel();
 		return -ENOSPC;
 	default:
-		inode->i_nlink--;
+		drop_nlink(inode);
 		err = 0;
 	}
 	goto out;
@@ -494,8 +494,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
 		err = -ENOSPC;
 		break;
 	default:
-		dir->i_nlink--;
-		inode->i_nlink = 0;
+		drop_nlink(dir);
+		clear_nlink(inode);
 		err = 0;
 	}
 	goto out;
@@ -590,7 +590,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		int r;
 		if ((r = hpfs_remove_dirent(old_dir, dno, dep, &qbh, 1)) != 2) {
 			if ((nde = map_dirent(new_dir, hpfs_i(new_dir)->i_dno, (char *)new_name, new_len, NULL, &qbh1))) {
-				new_inode->i_nlink = 0;
+				clear_nlink(new_inode);
 				copy_de(nde, &de);
 				memcpy(nde->name, new_name, new_len);
 				hpfs_mark_4buffers_dirty(&qbh1);
@@ -635,8 +635,8 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	end:
 	hpfs_i(i)->i_parent_dir = new_dir->i_ino;
 	if (S_ISDIR(i->i_mode)) {
-		new_dir->i_nlink++;
-		old_dir->i_nlink--;
+		inc_nlink(new_dir);
+		drop_nlink(old_dir);
 	}
 	if ((fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) {
 		fnode->up = new_dir->i_ino;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 8fe51c343786..450b5e0b4785 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -203,8 +203,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(hpfs_inode_cachep))
-		printk(KERN_INFO "hpfs_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(hpfs_inode_cachep);
 }
 
 /*
@@ -462,11 +461,10 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
 
 	int o;
 
-	sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 	s->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(*sbi));
 
 	sbi->sb_bmp_dir = NULL;
 	sbi->sb_cp_table = NULL;
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
index 3a9bdf58166f..642675fc394a 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs_kern.c
@@ -152,7 +152,6 @@ static void hppfs_read_inode(struct inode *ino)
 	ino->i_mode = proc_ino->i_mode;
 	ino->i_nlink = proc_ino->i_nlink;
 	ino->i_size = proc_ino->i_size;
-	ino->i_blksize = proc_ino->i_blksize;
 	ino->i_blocks = proc_ino->i_blocks;
 }
 
@@ -573,7 +572,7 @@ struct hppfs_dirent {
 };
 
 static int hppfs_filldir(void *d, const char *name, int size,
-			 loff_t offset, ino_t inode, unsigned int type)
+			 loff_t offset, u64 inode, unsigned int type)
 {
 	struct hppfs_dirent *dirent = d;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c3920c96dadf..4ee3f006b861 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -229,7 +229,7 @@ static void hugetlbfs_delete_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
-static void hugetlbfs_forget_inode(struct inode *inode)
+static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock)
 {
 	struct super_block *sb = inode->i_sb;
 
@@ -293,7 +293,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff)
 		if (h_vm_pgoff >= h_pgoff)
 			v_offset = 0;
 
-		unmap_hugepage_range(vma,
+		__unmap_hugepage_range(vma,
 				vma->vm_start + v_offset, vma->vm_end);
 	}
 }
@@ -357,7 +357,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_mode = mode;
 		inode->i_uid = uid;
 		inode->i_gid = gid;
-		inode->i_blksize = HPAGE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
@@ -378,7 +377,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		case S_IFLNK:
 			inode->i_op = &page_symlink_inode_operations;
@@ -419,7 +418,7 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return retval;
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 0bf9f0444a96..26cdb115ce67 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -133,7 +133,6 @@ static struct inode *alloc_inode(struct super_block *sb)
 		inode->i_bdev = NULL;
 		inode->i_cdev = NULL;
 		inode->i_rdev = 0;
-		inode->i_security = NULL;
 		inode->dirtied_when = 0;
 		if (security_inode_alloc(inode)) {
 			if (inode->i_sb->s_op->destroy_inode)
@@ -163,7 +162,7 @@ static struct inode *alloc_inode(struct super_block *sb)
 				bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 			mapping->backing_dev_info = bdi;
 		}
-		memset(&inode->u, 0, sizeof(inode->u));
+		inode->i_private = NULL;
 		inode->i_mapping = mapping;
 	}
 	return inode;
@@ -254,9 +253,9 @@ void clear_inode(struct inode *inode)
 	DQUOT_DROP(inode);
 	if (inode->i_sb && inode->i_sb->s_op->clear_inode)
 		inode->i_sb->s_op->clear_inode(inode);
-	if (inode->i_bdev)
+	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
 		bd_forget(inode);
-	if (inode->i_cdev)
+	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
 		cd_forget(inode);
 	inode->i_state = I_CLEAR;
 }
@@ -363,27 +362,6 @@ int invalidate_inodes(struct super_block * sb)
 }
 
 EXPORT_SYMBOL(invalidate_inodes);
- 
-int __invalidate_device(struct block_device *bdev)
-{
-	struct super_block *sb = get_super(bdev);
-	int res = 0;
-
-	if (sb) {
-		/*
-		 * no need to lock the super, get_super holds the
-		 * read mutex so the filesystem cannot go away
-		 * under us (->put_super runs with the write lock
-		 * hold).
-		 */
-		shrink_dcache_sb(sb);
-		res = invalidate_inodes(sb);
-		drop_super(sb);
-	}
-	invalidate_bdev(bdev, 0);
-	return res;
-}
-EXPORT_SYMBOL(__invalidate_device);
 
 static int can_unuse(struct inode *inode)
 {
@@ -679,7 +657,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
 	return inode;
 }
 
-static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
 {
 	unsigned long tmp;
 
@@ -1025,7 +1003,7 @@ void generic_delete_inode(struct inode *inode)
 
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
-	inode->i_state|=I_FREEING;
+	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
 
@@ -1232,13 +1210,15 @@ void file_update_time(struct file *file)
 		return;
 
 	now = current_fs_time(inode->i_sb);
-	if (!timespec_equal(&inode->i_mtime, &now))
+	if (!timespec_equal(&inode->i_mtime, &now)) {
+		inode->i_mtime = now;
 		sync_it = 1;
-	inode->i_mtime = now;
+	}
 
-	if (!timespec_equal(&inode->i_ctime, &now))
+	if (!timespec_equal(&inode->i_ctime, &now)) {
+		inode->i_ctime = now;
 		sync_it = 1;
-	inode->i_ctime = now;
+	}
 
 	if (sync_it)
 		mark_inode_dirty_sync(inode);
@@ -1326,6 +1306,42 @@ void wake_up_inode(struct inode *inode)
 	wake_up_bit(&inode->i_state, __I_LOCK);
 }
 
+/*
+ * We rarely want to lock two inodes that do not have a parent/child
+ * relationship (such as directory, child inode) simultaneously. The
+ * vast majority of file systems should be able to get along fine
+ * without this. Do not use these functions except as a last resort.
+ */
+void inode_double_lock(struct inode *inode1, struct inode *inode2)
+{
+	if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
+		if (inode1)
+			mutex_lock(&inode1->i_mutex);
+		else if (inode2)
+			mutex_lock(&inode2->i_mutex);
+		return;
+	}
+
+	if (inode1 < inode2) {
+		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+	} else {
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
+	}
+}
+EXPORT_SYMBOL(inode_double_lock);
+
+void inode_double_unlock(struct inode *inode1, struct inode *inode2)
+{
+	if (inode1)
+		mutex_unlock(&inode1->i_mutex);
+
+	if (inode2 && inode2 != inode1)
+		mutex_unlock(&inode2->i_mutex);
+}
+EXPORT_SYMBOL(inode_double_unlock);
+
 static __initdata unsigned long ihash_entries;
 static int __init set_ihash_entries(char *str)
 {
diff --git a/fs/internal.h b/fs/internal.h
new file mode 100644
index 000000000000..ea00126c9a59
--- /dev/null
+++ b/fs/internal.h
@@ -0,0 +1,55 @@
+/* fs/ internal definitions
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/ioctl32.h>
+
+struct super_block;
+
+/*
+ * block_dev.c
+ */
+#ifdef CONFIG_BLOCK
+extern struct super_block *blockdev_superblock;
+extern void __init bdev_cache_init(void);
+
+static inline int sb_is_blkdev_sb(struct super_block *sb)
+{
+	return sb == blockdev_superblock;
+}
+
+#else
+static inline void bdev_cache_init(void)
+{
+}
+
+static inline int sb_is_blkdev_sb(struct super_block *sb)
+{
+	return 0;
+}
+#endif
+
+/*
+ * char_dev.c
+ */
+extern void __init chrdev_init(void);
+
+/*
+ * compat_ioctl.c
+ */
+#ifdef CONFIG_COMPAT
+extern struct ioctl_trans ioctl_start[];
+extern int ioctl_table_size;
+#endif
+
+/*
+ * namespace.c
+ */
+extern int copy_mount_options(const void __user *, unsigned long *);
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 78b1deae3fa2..89e8da112a75 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -1,7 +1,7 @@
 /*
  * fs/ioprio.c
  *
- * Copyright (C) 2004 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
  *
  * Helper functions for setting/querying io priorities of processes. The
  * system calls closely mimmick getpriority/setpriority, see the man page for
@@ -47,8 +47,8 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 	/* see wmb() in current_io_context() */
 	smp_read_barrier_depends();
 
-	if (ioc && ioc->set_ioprio)
-		ioc->set_ioprio(ioc, ioprio);
+	if (ioc)
+		ioc->ioprio_changed = 1;
 
 	task_unlock(task);
 	return 0;
@@ -81,7 +81,12 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 	}
 
 	ret = -ESRCH;
-	read_lock_irq(&tasklist_lock);
+	/*
+	 * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic",
+	 * so we can't use rcu_read_lock(). See re-copy of ->ioprio
+	 * in copy_process().
+	 */
+	read_lock(&tasklist_lock);
 	switch (which) {
 		case IOPRIO_WHO_PROCESS:
 			if (!who)
@@ -124,7 +129,7 @@ free_uid:
 			ret = -EINVAL;
 	}
 
-	read_unlock_irq(&tasklist_lock);
+	read_unlock(&tasklist_lock);
 	return ret;
 }
 
@@ -145,11 +150,6 @@ int ioprio_best(unsigned short aprio, unsigned short bprio)
 	unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
 	unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
 
-	if (!ioprio_valid(aprio))
-		return bprio;
-	if (!ioprio_valid(bprio))
-		return aprio;
-
 	if (aclass == IOPRIO_CLASS_NONE)
 		aclass = IOPRIO_CLASS_BE;
 	if (bclass == IOPRIO_CLASS_NONE)
@@ -170,7 +170,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
 	int ret = -ESRCH;
 	int tmpio;
 
-	read_lock_irq(&tasklist_lock);
+	read_lock(&tasklist_lock);
 	switch (which) {
 		case IOPRIO_WHO_PROCESS:
 			if (!who)
@@ -221,7 +221,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
 			ret = -EINVAL;
 	}
 
-	read_unlock_irq(&tasklist_lock);
+	read_unlock(&tasklist_lock);
 	return ret;
 }
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 14391361c886..c34b862cdbf2 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -96,9 +96,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(isofs_inode_cachep))
-		printk(KERN_INFO "iso_inode_cache: not all structures were "
-					"freed\n");
+	kmem_cache_destroy(isofs_inode_cachep);
 }
 
 static int isofs_remount(struct super_block *sb, int *flags, char *data)
@@ -557,11 +555,10 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent)
 	struct iso9660_options		opt;
 	struct isofs_sb_info	      * sbi;
 
-	sbi = kmalloc(sizeof(*sbi), GFP_KERNEL);
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 	s->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(*sbi));
 
 	if (!parse_options((char *)data, &opt))
 		goto out_freesbi;
@@ -963,30 +960,30 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s,
 			goto abort;
 		}
 		
-		if (nextblk) {
-			while (b_off >= (offset + sect_size)) {
-				struct inode *ninode;
-				
-				offset += sect_size;
-				if (nextblk == 0)
-					goto abort;
-				ninode = isofs_iget(inode->i_sb, nextblk, nextoff);
-				if (!ninode)
-					goto abort;
-				firstext  = ISOFS_I(ninode)->i_first_extent;
-				sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode);
-				nextblk   = ISOFS_I(ninode)->i_next_section_block;
-				nextoff   = ISOFS_I(ninode)->i_next_section_offset;
-				iput(ninode);
-				
-				if (++section > 100) {
-					printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n");
-					printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u "
-					       "nextblk=%lu nextoff=%lu\n",
-					       iblock, firstext, (unsigned) sect_size,
-					       nextblk, nextoff);
-					goto abort;
-				}
+		/* On the last section, nextblk == 0, section size is likely to
+		 * exceed sect_size by a partial block, and access beyond the
+		 * end of the file will reach beyond the section size, too.
+		 */
+		while (nextblk && (b_off >= (offset + sect_size))) {
+			struct inode *ninode;
+
+			offset += sect_size;
+			ninode = isofs_iget(inode->i_sb, nextblk, nextoff);
+			if (!ninode)
+				goto abort;
+			firstext  = ISOFS_I(ninode)->i_first_extent;
+			sect_size = ISOFS_I(ninode)->i_section_size >> ISOFS_BUFFER_BITS(ninode);
+			nextblk   = ISOFS_I(ninode)->i_next_section_block;
+			nextoff   = ISOFS_I(ninode)->i_next_section_offset;
+			iput(ninode);
+
+			if (++section > 100) {
+				printk("isofs_get_blocks: More than 100 file sections ?!?, aborting...\n");
+				printk("isofs_get_blocks: block=%ld firstext=%u sect_size=%u "
+				       "nextblk=%lu nextoff=%lu\n",
+				       iblock, firstext, (unsigned) sect_size,
+				       nextblk, nextoff);
+				goto abort;
 			}
 		}
 		
@@ -1238,7 +1235,7 @@ static void isofs_read_inode(struct inode *inode)
 	}
 	inode->i_uid = sbi->s_uid;
 	inode->i_gid = sbi->s_gid;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 
 	ei->i_format_parm[0] = 0;
 	ei->i_format_parm[1] = 0;
@@ -1294,7 +1291,6 @@ static void isofs_read_inode(struct inode *inode)
 			      isonum_711 (de->ext_attr_length));
 
 	/* Set the number of blocks for stat() - should be done before RR */
-	inode->i_blksize = PAGE_CACHE_SIZE; /* For stat() only */
 	inode->i_blocks  = (inode->i_size + 511) >> 9;
 
 	/*
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c
index 81a90e170ac3..fb8fe7a9ddc6 100644
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -14,9 +14,9 @@
  * Convert Unicode 16 to UTF-8 or ASCII.
  */
 static int
-uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls)
+uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
 {
-	wchar_t *ip, ch;
+	__be16 *ip, ch;
 	unsigned char *op;
 
 	ip = uni;
@@ -24,8 +24,8 @@ uni16_to_x8(unsigned char *ascii, u16 *uni, int len, struct nls_table *nls)
 
 	while ((ch = get_unaligned(ip)) && len) {
 		int llen;
-		ch = be16_to_cpu(ch);
-		if ((llen = nls->uni2char(ch, op, NLS_MAX_CHARSET_SIZE)) > 0)
+		llen = nls->uni2char(be16_to_cpu(ch), op, NLS_MAX_CHARSET_SIZE);
+		if (llen > 0)
 			op += llen;
 		else
 			*op++ = '?';
@@ -82,7 +82,7 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
 		len = wcsntombs_be(outname, de->name,
 				   de->name_len[0] >> 1, PAGE_SIZE);
 	} else {
-		len = uni16_to_x8(outname, (u16 *) de->name,
+		len = uni16_to_x8(outname, (__be16 *) de->name,
 				  de->name_len[0] >> 1, nls);
 	}
 	if ((len > 2) && (outname[len-2] == ';') && (outname[len-1] == '1')) {
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index e7ba0c30e071..c04b3a14a3e9 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -6,7 +6,6 @@
  *  (C) 1991  Linus Torvalds - minix filesystem
  */
 
-#include <linux/config.h>	/* Joliet? */
 #include <linux/smp_lock.h>
 #include "isofs.h"
 
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 47678a26c13b..0208cc7ac5d0 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -1,6 +1,6 @@
 /*
  * linux/fs/checkpoint.c
- * 
+ *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
  * Copyright 1999 Red Hat Software --- All Rights Reserved
@@ -9,8 +9,8 @@
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
  *
- * Checkpoint routines for the generic filesystem journaling code.  
- * Part of the ext2fs journaling system.  
+ * Checkpoint routines for the generic filesystem journaling code.
+ * Part of the ext2fs journaling system.
  *
  * Checkpointing is the process of ensuring that a section of the log is
  * committed fully to disk, so that that portion of the log can be
@@ -145,6 +145,7 @@ void __log_wait_for_space(journal_t *journal)
  * jbd_unlock_bh_state().
  */
 static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
+	__releases(journal->j_list_lock)
 {
 	get_bh(bh);
 	spin_unlock(&journal->j_list_lock);
@@ -225,7 +226,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Try to flush one buffer from the checkpoint list to disk.
  *
  * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.  
+ * scan of the checkpoint list.
  *
  * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -269,7 +270,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		 * possibly block, while still holding the journal lock.
 		 * We cannot afford to let the transaction logic start
 		 * messing around with this buffer before we write it to
-		 * disk, as that would break recoverability.  
+		 * disk, as that would break recoverability.
 		 */
 		BUFFER_TRACE(bh, "queue");
 		get_bh(bh);
@@ -292,7 +293,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
  * Perform an actual checkpoint. We take the first transaction on the
  * list of transactions to be checkpointed and send all its buffers
  * to disk. We submit larger chunks of data at once.
- * 
+ *
  * The journal should be locked before calling this function.
  */
 int log_do_checkpoint(journal_t *journal)
@@ -303,10 +304,10 @@ int log_do_checkpoint(journal_t *journal)
 
 	jbd_debug(1, "Start checkpoint\n");
 
-	/* 
+	/*
 	 * First thing: if there are any transactions in the log which
 	 * don't need checkpointing, just eliminate them from the
-	 * journal straight away.  
+	 * journal straight away.
 	 */
 	result = cleanup_journal_tail(journal);
 	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
@@ -384,9 +385,9 @@ out:
  * we have already got rid of any since the last update of the log tail
  * in the journal superblock.  If so, we can instantly roll the
  * superblock forward to remove those transactions from the log.
- * 
+ *
  * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
- * 
+ *
  * Called with the journal lock held.
  *
  * This is the only part of the journaling code which really needs to be
@@ -403,8 +404,8 @@ int cleanup_journal_tail(journal_t *journal)
 	unsigned long	blocknr, freed;
 
 	/* OK, work out the oldest transaction remaining in the log, and
-	 * the log block it starts at. 
-	 * 
+	 * the log block it starts at.
+	 *
 	 * If the log is now empty, we need to work out which is the
 	 * next transaction ID we will write, and where it will
 	 * start. */
@@ -479,7 +480,7 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
 	if (!jh)
 		return 0;
 
- 	last_jh = jh->b_cpprev;
+	last_jh = jh->b_cpprev;
 	do {
 		jh = next_jh;
 		next_jh = jh->b_cpnext;
@@ -557,7 +558,7 @@ out:
 	return ret;
 }
 
-/* 
+/*
  * journal_remove_checkpoint: called after a buffer has been committed
  * to disk (either by being write-back flushed to disk, or being
  * committed to the log).
@@ -635,7 +636,7 @@ out:
  * Called with the journal locked.
  * Called with j_list_lock held.
  */
-void __journal_insert_checkpoint(struct journal_head *jh, 
+void __journal_insert_checkpoint(struct journal_head *jh,
 			       transaction_t *transaction)
 {
 	JBUFFER_TRACE(jh, "entry");
@@ -657,7 +658,7 @@ void __journal_insert_checkpoint(struct journal_head *jh,
 
 /*
  * We've finished with this transaction structure: adios...
- * 
+ *
  * The transaction must have no links except for the checkpoint by this
  * point.
  *
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 32a8caf0c41e..10be51290a27 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/commit.c
+ * linux/fs/jbd/commit.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  *
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f66724ce443a..b85c686b60db 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/journal.c
+ * linux/fs/jbd/journal.c
  *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  *
@@ -181,7 +181,7 @@ loop:
 						transaction->t_expires))
 			should_sleep = 0;
 		if (journal->j_flags & JFS_UNMOUNT)
- 			should_sleep = 0;
+			should_sleep = 0;
 		if (should_sleep) {
 			spin_unlock(&journal->j_state_lock);
 			schedule();
@@ -271,7 +271,7 @@ static void journal_kill_thread(journal_t *journal)
 int journal_write_metadata_buffer(transaction_t *transaction,
 				  struct journal_head  *jh_in,
 				  struct journal_head **jh_out,
-				  int blocknr)
+				  unsigned long blocknr)
 {
 	int need_copy_out = 0;
 	int done_copy_out = 0;
@@ -578,7 +578,7 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
  * this is a no-op.  If needed, we can use j_blk_offset - everything is
  * ready.
  */
-int journal_bmap(journal_t *journal, unsigned long blocknr, 
+int journal_bmap(journal_t *journal, unsigned long blocknr,
 		 unsigned long *retp)
 {
 	int err = 0;
@@ -696,13 +696,13 @@ fail:
  *  @bdev: Block device on which to create the journal
  *  @fs_dev: Device which hold journalled filesystem for this journal.
  *  @start: Block nr Start of journal.
- *  @len:  Lenght of the journal in blocks.
+ *  @len:  Length of the journal in blocks.
  *  @blocksize: blocksize of journalling device
  *  @returns: a newly created journal_t *
- *  
+ *
  *  journal_init_dev creates a journal which maps a fixed contiguous
  *  range of blocks on an arbitrary block device.
- * 
+ *
  */
 journal_t * journal_init_dev(struct block_device *bdev,
 			struct block_device *fs_dev,
@@ -715,18 +715,8 @@ journal_t * journal_init_dev(struct block_device *bdev,
 	if (!journal)
 		return NULL;
 
-	journal->j_dev = bdev;
-	journal->j_fs_dev = fs_dev;
-	journal->j_blk_offset = start;
-	journal->j_maxlen = len;
-	journal->j_blocksize = blocksize;
-
-	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
-	journal->j_sb_buffer = bh;
-	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-
 	/* journal descriptor can store up to n blocks -bzzz */
+	journal->j_blocksize = blocksize;
 	n = journal->j_blocksize / sizeof(journal_block_tag_t);
 	journal->j_wbufsize = n;
 	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
@@ -735,15 +725,25 @@ journal_t * journal_init_dev(struct block_device *bdev,
 			__FUNCTION__);
 		kfree(journal);
 		journal = NULL;
+		goto out;
 	}
+	journal->j_dev = bdev;
+	journal->j_fs_dev = fs_dev;
+	journal->j_blk_offset = start;
+	journal->j_maxlen = len;
 
+	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
+	J_ASSERT(bh != NULL);
+	journal->j_sb_buffer = bh;
+	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+out:
 	return journal;
 }
- 
-/** 
+
+/**
  *  journal_t * journal_init_inode () - creates a journal which maps to a inode.
  *  @inode: An inode to create the journal in
- *  
+ *
  * journal_init_inode creates a journal which maps an on-disk inode as
  * the journal.  The inode must exist already, must support bmap() and
  * must have all data blocks preallocated.
@@ -763,7 +763,7 @@ journal_t * journal_init_inode (struct inode *inode)
 	journal->j_inode = inode;
 	jbd_debug(1,
 		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
-		  journal, inode->i_sb->s_id, inode->i_ino, 
+		  journal, inode->i_sb->s_id, inode->i_ino,
 		  (long long) inode->i_size,
 		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
 
@@ -798,10 +798,10 @@ journal_t * journal_init_inode (struct inode *inode)
 	return journal;
 }
 
-/* 
+/*
  * If the journal init or create aborts, we need to mark the journal
  * superblock as being NULL to prevent the journal destroy from writing
- * back a bogus superblock. 
+ * back a bogus superblock.
  */
 static void journal_fail_superblock (journal_t *journal)
 {
@@ -820,7 +820,7 @@ static void journal_fail_superblock (journal_t *journal)
 static int journal_reset(journal_t *journal)
 {
 	journal_superblock_t *sb = journal->j_superblock;
-	unsigned int first, last;
+	unsigned long first, last;
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
@@ -844,13 +844,13 @@ static int journal_reset(journal_t *journal)
 	return 0;
 }
 
-/** 
+/**
  * int journal_create() - Initialise the new journal file
  * @journal: Journal to create. This structure must have been initialised
- * 
+ *
  * Given a journal_t structure which tells us which disk blocks we can
  * use, create a new journal superblock and initialise all of the
- * journal fields from scratch.  
+ * journal fields from scratch.
  **/
 int journal_create(journal_t *journal)
 {
@@ -915,7 +915,7 @@ int journal_create(journal_t *journal)
 	return journal_reset(journal);
 }
 
-/** 
+/**
  * void journal_update_superblock() - Update journal sb on disk.
  * @journal: The journal to update.
  * @wait: Set to '0' if you don't want to wait for IO completion.
@@ -939,7 +939,7 @@ void journal_update_superblock(journal_t *journal, int wait)
 				journal->j_transaction_sequence) {
 		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
 			"(start %ld, seq %d, errno %d)\n",
-			journal->j_tail, journal->j_tail_sequence, 
+			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
 		goto out;
 	}
@@ -1062,7 +1062,7 @@ static int load_superblock(journal_t *journal)
 /**
  * int journal_load() - Read journal from disk.
  * @journal: Journal to act on.
- * 
+ *
  * Given a journal_t structure which tells us which disk blocks contain
  * a journal, read the journal from disk to initialise the in-memory
  * structures.
@@ -1094,7 +1094,7 @@ int journal_load(journal_t *journal)
 	/*
 	 * Create a slab for this blocksize
 	 */
-	err = journal_create_jbd_slab(cpu_to_be32(sb->s_blocksize));
+	err = journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
 	if (err)
 		return err;
 
@@ -1172,9 +1172,9 @@ void journal_destroy(journal_t *journal)
  * @compat: bitmask of compatible features
  * @ro: bitmask of features that force read-only mount
  * @incompat: bitmask of incompatible features
- * 
+ *
  * Check whether the journal uses all of a given set of
- * features.  Return true (non-zero) if it does. 
+ * features.  Return true (non-zero) if it does.
  **/
 
 int journal_check_used_features (journal_t *journal, unsigned long compat,
@@ -1203,7 +1203,7 @@ int journal_check_used_features (journal_t *journal, unsigned long compat,
  * @compat: bitmask of compatible features
  * @ro: bitmask of features that force read-only mount
  * @incompat: bitmask of incompatible features
- * 
+ *
  * Check whether the journaling code supports the use of
  * all of a given set of features on this journal.  Return true
  * (non-zero) if it can. */
@@ -1241,7 +1241,7 @@ int journal_check_available_features (journal_t *journal, unsigned long compat,
  * @incompat: bitmask of incompatible features
  *
  * Mark a given journal feature as present on the
- * superblock.  Returns true if the requested features could be set. 
+ * superblock.  Returns true if the requested features could be set.
  *
  */
 
@@ -1327,7 +1327,7 @@ static int journal_convert_superblock_v1(journal_t *journal,
 /**
  * int journal_flush () - Flush journal
  * @journal: Journal to act on.
- * 
+ *
  * Flush all data for a given journal to disk and empty the journal.
  * Filesystems can use this when remounting readonly to ensure that
  * recovery does not need to happen on remount.
@@ -1394,7 +1394,7 @@ int journal_flush(journal_t *journal)
  * int journal_wipe() - Wipe journal contents
  * @journal: Journal to act on.
  * @write: flag (see below)
- * 
+ *
  * Wipe out all of the contents of a journal, safely.  This will produce
  * a warning if the journal contains any valid recovery information.
  * Must be called between journal_init_*() and journal_load().
@@ -1449,7 +1449,7 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
 
 /*
  * Journal abort has very specific semantics, which we describe
- * for journal abort. 
+ * for journal abort.
  *
  * Two internal function, which provide abort to te jbd layer
  * itself are here.
@@ -1504,7 +1504,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
  * Perform a complete, immediate shutdown of the ENTIRE
  * journal (not of a single transaction).  This operation cannot be
  * undone without closing and reopening the journal.
- *           
+ *
  * The journal_abort function is intended to support higher level error
  * recovery mechanisms such as the ext2/ext3 remount-readonly error
  * mode.
@@ -1538,7 +1538,7 @@ static void __journal_abort_soft (journal_t *journal, int errno)
  * supply an errno; a null errno implies that absolutely no further
  * writes are done to the journal (unless there are any already in
  * progress).
- * 
+ *
  */
 
 void journal_abort(journal_t *journal, int errno)
@@ -1546,7 +1546,7 @@ void journal_abort(journal_t *journal, int errno)
 	__journal_abort_soft(journal, errno);
 }
 
-/** 
+/**
  * int journal_errno () - returns the journal's error state.
  * @journal: journal to examine.
  *
@@ -1570,7 +1570,7 @@ int journal_errno(journal_t *journal)
 	return err;
 }
 
-/** 
+/**
  * int journal_clear_err () - clears the journal's error state
  * @journal: journal to act on.
  *
@@ -1590,7 +1590,7 @@ int journal_clear_err(journal_t *journal)
 	return err;
 }
 
-/** 
+/**
  * void journal_ack_err() - Ack journal err.
  * @journal: journal to act on.
  *
@@ -1612,7 +1612,7 @@ int journal_blocks_per_page(struct inode *inode)
 
 /*
  * Simple support for retrying memory allocations.  Introduced to help to
- * debug different VM deadlock avoidance strategies. 
+ * debug different VM deadlock avoidance strategies.
  */
 void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
 {
@@ -2047,13 +2047,7 @@ static int __init journal_init(void)
 {
 	int ret;
 
-/* Static check for data structure consistency.  There's no code
- * invoked --- we'll just get a linker failure if things aren't right.
- */
-	extern void journal_bad_superblock_size(void);
-	if (sizeof(struct journal_superblock_s) != 1024)
-		journal_bad_superblock_size();
-
+	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
 
 	ret = journal_init_caches();
 	if (ret != 0)
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index de5bafb4e853..11563fe2a52b 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -1,6 +1,6 @@
 /*
  * linux/fs/recovery.c
- * 
+ *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  *
  * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
@@ -10,7 +10,7 @@
  * option, any later version, incorporated herein by reference.
  *
  * Journal recovery routines for the generic filesystem journaling code;
- * part of the ext2fs journaling system.  
+ * part of the ext2fs journaling system.
  */
 
 #ifndef __KERNEL__
@@ -25,9 +25,9 @@
 
 /*
  * Maintain information about the progress of the recovery job, so that
- * the different passes can carry information between them. 
+ * the different passes can carry information between them.
  */
-struct recovery_info 
+struct recovery_info
 {
 	tid_t		start_transaction;
 	tid_t		end_transaction;
@@ -46,7 +46,7 @@ static int scan_revoke_records(journal_t *, struct buffer_head *,
 #ifdef __KERNEL__
 
 /* Release readahead buffers after use */
-void journal_brelse_array(struct buffer_head *b[], int n)
+static void journal_brelse_array(struct buffer_head *b[], int n)
 {
 	while (--n >= 0)
 		brelse (b[n]);
@@ -116,7 +116,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
 	err = 0;
 
 failed:
-	if (nbufs) 
+	if (nbufs)
 		journal_brelse_array(bufs, nbufs);
 	return err;
 }
@@ -128,7 +128,7 @@ failed:
  * Read a block from the journal
  */
 
-static int jread(struct buffer_head **bhp, journal_t *journal, 
+static int jread(struct buffer_head **bhp, journal_t *journal,
 		 unsigned int offset)
 {
 	int err;
@@ -212,14 +212,14 @@ do {									\
 /**
  * journal_recover - recovers a on-disk journal
  * @journal: the journal to recover
- * 
+ *
  * The primary function for recovering the log contents when mounting a
- * journaled device.  
+ * journaled device.
  *
  * Recovery is done in three passes.  In the first pass, we look for the
  * end of the log.  In the second, we assemble the list of revoke
  * blocks.  In the third and final pass, we replay any un-revoked blocks
- * in the log.  
+ * in the log.
  */
 int journal_recover(journal_t *journal)
 {
@@ -231,10 +231,10 @@ int journal_recover(journal_t *journal)
 	memset(&info, 0, sizeof(info));
 	sb = journal->j_superblock;
 
-	/* 
+	/*
 	 * The journal superblock's s_start field (the current log head)
 	 * is always zero if, and only if, the journal was cleanly
-	 * unmounted.  
+	 * unmounted.
 	 */
 
 	if (!sb->s_start) {
@@ -253,7 +253,7 @@ int journal_recover(journal_t *journal)
 	jbd_debug(0, "JBD: recovery, exit status %d, "
 		  "recovered transactions %u to %u\n",
 		  err, info.start_transaction, info.end_transaction);
-	jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", 
+	jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
 		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
 
 	/* Restart the log at the next transaction ID, thus invalidating
@@ -268,15 +268,15 @@ int journal_recover(journal_t *journal)
 /**
  * journal_skip_recovery - Start journal and wipe exiting records
  * @journal: journal to startup
- * 
+ *
  * Locate any valid recovery information from the journal and set up the
  * journal structures in memory to ignore it (presumably because the
- * caller has evidence that it is out of date).  
+ * caller has evidence that it is out of date).
  * This function does'nt appear to be exorted..
  *
  * We perform one pass over the journal to allow us to tell the user how
  * much recovery information is being erased, and to let us initialise
- * the journal transaction sequence numbers to the next unused ID. 
+ * the journal transaction sequence numbers to the next unused ID.
  */
 int journal_skip_recovery(journal_t *journal)
 {
@@ -297,7 +297,7 @@ int journal_skip_recovery(journal_t *journal)
 #ifdef CONFIG_JBD_DEBUG
 		int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
 #endif
-		jbd_debug(0, 
+		jbd_debug(0,
 			  "JBD: ignoring %d transaction%s from the journal.\n",
 			  dropped, (dropped == 1) ? "" : "s");
 		journal->j_transaction_sequence = ++info.end_transaction;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
 	unsigned long		next_log_block;
 	int			err, success = 0;
 	journal_superblock_t *	sb;
-	journal_header_t * 	tmp;
+	journal_header_t *	tmp;
 	struct buffer_head *	bh;
 	unsigned int		sequence;
 	int			blocktype;
@@ -324,10 +324,10 @@ static int do_one_pass(journal_t *journal,
 	MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
 			       / sizeof(journal_block_tag_t));
 
-	/* 
+	/*
 	 * First thing is to establish what we expect to find in the log
 	 * (in terms of transaction IDs), and where (in terms of log
-	 * block offsets): query the superblock.  
+	 * block offsets): query the superblock.
 	 */
 
 	sb = journal->j_superblock;
@@ -344,7 +344,7 @@ static int do_one_pass(journal_t *journal,
 	 * Now we walk through the log, transaction by transaction,
 	 * making sure that each transaction has a commit block in the
 	 * expected place.  Each complete transaction gets replayed back
-	 * into the main filesystem. 
+	 * into the main filesystem.
 	 */
 
 	while (1) {
@@ -379,8 +379,8 @@ static int do_one_pass(journal_t *journal,
 		next_log_block++;
 		wrap(journal, next_log_block);
 
-		/* What kind of buffer is it? 
-		 * 
+		/* What kind of buffer is it?
+		 *
 		 * If it is a descriptor block, check that it has the
 		 * expected sequence number.  Otherwise, we're all done
 		 * here. */
@@ -394,7 +394,7 @@ static int do_one_pass(journal_t *journal,
 
 		blocktype = be32_to_cpu(tmp->h_blocktype);
 		sequence = be32_to_cpu(tmp->h_sequence);
-		jbd_debug(3, "Found magic %d, sequence %d\n", 
+		jbd_debug(3, "Found magic %d, sequence %d\n",
 			  blocktype, sequence);
 
 		if (sequence != next_commit_ID) {
@@ -438,7 +438,7 @@ static int do_one_pass(journal_t *journal,
 					/* Recover what we can, but
 					 * report failure at the end. */
 					success = err;
-					printk (KERN_ERR 
+					printk (KERN_ERR
 						"JBD: IO error %d recovering "
 						"block %ld in log\n",
 						err, io_block);
@@ -452,7 +452,7 @@ static int do_one_pass(journal_t *journal,
 					 * revoked, then we're all done
 					 * here. */
 					if (journal_test_revoke
-					    (journal, blocknr, 
+					    (journal, blocknr,
 					     next_commit_ID)) {
 						brelse(obh);
 						++info->nr_revoke_hits;
@@ -465,7 +465,7 @@ static int do_one_pass(journal_t *journal,
 							blocknr,
 							journal->j_blocksize);
 					if (nbh == NULL) {
-						printk(KERN_ERR 
+						printk(KERN_ERR
 						       "JBD: Out of memory "
 						       "during recovery.\n");
 						err = -ENOMEM;
@@ -537,7 +537,7 @@ static int do_one_pass(journal_t *journal,
 	}
 
  done:
-	/* 
+	/*
 	 * We broke out of the log scan loop: either we came to the
 	 * known end of the log or we found an unexpected block in the
 	 * log.  If the latter happened, then we know that the "current"
@@ -567,7 +567,7 @@ static int do_one_pass(journal_t *journal,
 
 /* Scan a revoke record, marking all blocks mentioned as revoked. */
 
-static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 
+static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 			       tid_t sequence, struct recovery_info *info)
 {
 	journal_revoke_header_t *header;
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index a56144183462..c532429d8d9b 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -1,6 +1,6 @@
 /*
  * linux/fs/revoke.c
- * 
+ *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
  *
  * Copyright 2000 Red Hat corp --- All Rights Reserved
@@ -15,10 +15,10 @@
  * Revoke is the mechanism used to prevent old log records for deleted
  * metadata from being replayed on top of newer data using the same
  * blocks.  The revoke mechanism is used in two separate places:
- * 
+ *
  * + Commit: during commit we write the entire list of the current
  *   transaction's revoked blocks to the journal
- * 
+ *
  * + Recovery: during recovery we record the transaction ID of all
  *   revoked blocks.  If there are multiple revoke records in the log
  *   for a single block, only the last one counts, and if there is a log
@@ -29,7 +29,7 @@
  * single transaction:
  *
  * Block is revoked and then journaled:
- *   The desired end result is the journaling of the new block, so we 
+ *   The desired end result is the journaling of the new block, so we
  *   cancel the revoke before the transaction commits.
  *
  * Block is journaled and then revoked:
@@ -41,7 +41,7 @@
  *   transaction must have happened after the block was journaled and so
  *   the revoke must take precedence.
  *
- * Block is revoked and then written as data: 
+ * Block is revoked and then written as data:
  *   The data write is allowed to succeed, but the revoke is _not_
  *   cancelled.  We still need to prevent old log records from
  *   overwriting the new data.  We don't even need to clear the revoke
@@ -54,7 +54,7 @@
  *			buffer has not been revoked, and cancel_revoke
  *			need do nothing.
  * RevokeValid set, Revoked set:
- *			buffer has been revoked.  
+ *			buffer has been revoked.
  */
 
 #ifndef __KERNEL__
@@ -77,7 +77,7 @@ static kmem_cache_t *revoke_table_cache;
    journal replay, this involves recording the transaction ID of the
    last transaction to revoke this block. */
 
-struct jbd_revoke_record_s 
+struct jbd_revoke_record_s
 {
 	struct list_head  hash;
 	tid_t		  sequence;	/* Used for recovery only */
@@ -90,8 +90,8 @@ struct jbd_revoke_table_s
 {
 	/* It is conceivable that we might want a larger hash table
 	 * for recovery.  Must be a power of two. */
-	int		  hash_size; 
-	int		  hash_shift; 
+	int		  hash_size;
+	int		  hash_shift;
 	struct list_head *hash_table;
 };
 
@@ -301,22 +301,22 @@ void journal_destroy_revoke(journal_t *journal)
 
 #ifdef __KERNEL__
 
-/* 
+/*
  * journal_revoke: revoke a given buffer_head from the journal.  This
  * prevents the block from being replayed during recovery if we take a
  * crash after this current transaction commits.  Any subsequent
  * metadata writes of the buffer in this transaction cancel the
- * revoke.  
+ * revoke.
  *
  * Note that this call may block --- it is up to the caller to make
  * sure that there are no further calls to journal_write_metadata
  * before the revoke is complete.  In ext3, this implies calling the
  * revoke before clearing the block bitmap when we are deleting
- * metadata. 
+ * metadata.
  *
  * Revoke performs a journal_forget on any buffer_head passed in as a
  * parameter, but does _not_ forget the buffer_head if the bh was only
- * found implicitly. 
+ * found implicitly.
  *
  * bh_in may not be a journalled buffer - it may have come off
  * the hash tables without an attached journal_head.
@@ -325,7 +325,7 @@ void journal_destroy_revoke(journal_t *journal)
  * by one.
  */
 
-int journal_revoke(handle_t *handle, unsigned long blocknr, 
+int journal_revoke(handle_t *handle, unsigned long blocknr,
 		   struct buffer_head *bh_in)
 {
 	struct buffer_head *bh = NULL;
@@ -487,7 +487,7 @@ void journal_switch_revoke_table(journal_t *journal)
 	else
 		journal->j_revoke = journal->j_revoke_table[0];
 
-	for (i = 0; i < journal->j_revoke->hash_size; i++) 
+	for (i = 0; i < journal->j_revoke->hash_size; i++)
 		INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
 }
 
@@ -498,7 +498,7 @@ void journal_switch_revoke_table(journal_t *journal)
  * Called with the journal lock held.
  */
 
-void journal_write_revoke_records(journal_t *journal, 
+void journal_write_revoke_records(journal_t *journal,
 				  transaction_t *transaction)
 {
 	struct journal_head *descriptor;
@@ -507,7 +507,7 @@ void journal_write_revoke_records(journal_t *journal,
 	struct list_head *hash_list;
 	int i, offset, count;
 
-	descriptor = NULL; 
+	descriptor = NULL;
 	offset = 0;
 	count = 0;
 
@@ -519,10 +519,10 @@ void journal_write_revoke_records(journal_t *journal,
 		hash_list = &revoke->hash_table[i];
 
 		while (!list_empty(hash_list)) {
-			record = (struct jbd_revoke_record_s *) 
+			record = (struct jbd_revoke_record_s *)
 				hash_list->next;
 			write_one_revoke_record(journal, transaction,
-						&descriptor, &offset, 
+						&descriptor, &offset,
 						record);
 			count++;
 			list_del(&record->hash);
@@ -534,14 +534,14 @@ void journal_write_revoke_records(journal_t *journal,
 	jbd_debug(1, "Wrote %d revoke records\n", count);
 }
 
-/* 
+/*
  * Write out one revoke record.  We need to create a new descriptor
- * block if the old one is full or if we have not already created one.  
+ * block if the old one is full or if we have not already created one.
  */
 
-static void write_one_revoke_record(journal_t *journal, 
+static void write_one_revoke_record(journal_t *journal,
 				    transaction_t *transaction,
-				    struct journal_head **descriptorp, 
+				    struct journal_head **descriptorp,
 				    int *offsetp,
 				    struct jbd_revoke_record_s *record)
 {
@@ -584,21 +584,21 @@ static void write_one_revoke_record(journal_t *journal,
 		*descriptorp = descriptor;
 	}
 
-	* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) = 
+	* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
 		cpu_to_be32(record->blocknr);
 	offset += 4;
 	*offsetp = offset;
 }
 
-/* 
+/*
  * Flush a revoke descriptor out to the journal.  If we are aborting,
  * this is a noop; otherwise we are generating a buffer which needs to
  * be waited for during commit, so it has to go onto the appropriate
  * journal buffer list.
  */
 
-static void flush_descriptor(journal_t *journal, 
-			     struct journal_head *descriptor, 
+static void flush_descriptor(journal_t *journal,
+			     struct journal_head *descriptor,
 			     int offset)
 {
 	journal_revoke_header_t *header;
@@ -618,7 +618,7 @@ static void flush_descriptor(journal_t *journal,
 }
 #endif
 
-/* 
+/*
  * Revoke support for recovery.
  *
  * Recovery needs to be able to:
@@ -629,7 +629,7 @@ static void flush_descriptor(journal_t *journal,
  *  check whether a given block in a given transaction should be replayed
  *  (ie. has not been revoked by a revoke record in that or a subsequent
  *  transaction)
- * 
+ *
  *  empty the revoke table after recovery.
  */
 
@@ -637,11 +637,11 @@ static void flush_descriptor(journal_t *journal,
  * First, setting revoke records.  We create a new revoke record for
  * every block ever revoked in the log as we scan it for recovery, and
  * we update the existing records if we find multiple revokes for a
- * single block. 
+ * single block.
  */
 
-int journal_set_revoke(journal_t *journal, 
-		       unsigned long blocknr, 
+int journal_set_revoke(journal_t *journal,
+		       unsigned long blocknr,
 		       tid_t sequence)
 {
 	struct jbd_revoke_record_s *record;
@@ -653,18 +653,18 @@ int journal_set_revoke(journal_t *journal,
 		if (tid_gt(sequence, record->sequence))
 			record->sequence = sequence;
 		return 0;
-	} 
+	}
 	return insert_revoke_hash(journal, blocknr, sequence);
 }
 
-/* 
+/*
  * Test revoke records.  For a given block referenced in the log, has
  * that block been revoked?  A revoke record with a given transaction
  * sequence number revokes all blocks in that transaction and earlier
  * ones, but later transactions still need replayed.
  */
 
-int journal_test_revoke(journal_t *journal, 
+int journal_test_revoke(journal_t *journal,
 			unsigned long blocknr,
 			tid_t sequence)
 {
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index f5169a96260e..d5c63047a8b3 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1,6 +1,6 @@
 /*
  * linux/fs/transaction.c
- * 
+ *
  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
  *
  * Copyright 1998 Red Hat corp --- All Rights Reserved
@@ -10,7 +10,7 @@
  * option, any later version, incorporated herein by reference.
  *
  * Generic filesystem transaction handling code; part of the ext2fs
- * journaling system.  
+ * journaling system.
  *
  * This file manages transactions (compound commits managed by the
  * journaling code) and handles (individual atomic operations by the
@@ -74,7 +74,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
  * start_this_handle: Given a handle, deal with any locking or stalling
  * needed to make sure that there is enough journal space for the handle
  * to begin.  Attach the handle to a transaction and set up the
- * transaction's buffer credits.  
+ * transaction's buffer credits.
  */
 
 static int start_this_handle(journal_t *journal, handle_t *handle)
@@ -117,7 +117,7 @@ repeat_locked:
 	if (is_journal_aborted(journal) ||
 	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
 		spin_unlock(&journal->j_state_lock);
-		ret = -EROFS; 
+		ret = -EROFS;
 		goto out;
 	}
 
@@ -182,7 +182,7 @@ repeat_locked:
 		goto repeat;
 	}
 
-	/* 
+	/*
 	 * The commit code assumes that it can get enough log space
 	 * without forcing a checkpoint.  This is *critical* for
 	 * correctness: a checkpoint of a buffer which is also
@@ -191,7 +191,7 @@ repeat_locked:
 	 *
 	 * We must therefore ensure the necessary space in the journal
 	 * *before* starting to dirty potentially checkpointed buffers
-	 * in the new transaction. 
+	 * in the new transaction.
 	 *
 	 * The worst part is, any transaction currently committing can
 	 * reduce the free space arbitrarily.  Be careful to account for
@@ -246,13 +246,13 @@ static handle_t *new_handle(int nblocks)
 }
 
 /**
- * handle_t *journal_start() - Obtain a new handle.  
+ * handle_t *journal_start() - Obtain a new handle.
  * @journal: Journal to start transaction on.
  * @nblocks: number of block buffer we might modify
  *
  * We make sure that the transaction can guarantee at least nblocks of
  * modified buffers in the log.  We block until the log can guarantee
- * that much space.  
+ * that much space.
  *
  * This function is visible to journal users (like ext3fs), so is not
  * called with the journal already locked.
@@ -292,11 +292,11 @@ handle_t *journal_start(journal_t *journal, int nblocks)
  * int journal_extend() - extend buffer credits.
  * @handle:  handle to 'extend'
  * @nblocks: nr blocks to try to extend by.
- * 
+ *
  * Some transactions, such as large extends and truncates, can be done
  * atomically all at once or in several stages.  The operation requests
  * a credit for a number of buffer modications in advance, but can
- * extend its credit if it needs more.  
+ * extend its credit if it needs more.
  *
  * journal_extend tries to give the running handle more buffer credits.
  * It does not guarantee that allocation - this is a best-effort only.
@@ -363,7 +363,7 @@ out:
  * int journal_restart() - restart a handle .
  * @handle:  handle to restart
  * @nblocks: nr credits requested
- * 
+ *
  * Restart a handle for a multi-transaction filesystem
  * operation.
  *
@@ -462,7 +462,7 @@ void journal_lock_updates(journal_t *journal)
 /**
  * void journal_unlock_updates (journal_t* journal) - release barrier
  * @journal:  Journal to release the barrier on.
- * 
+ *
  * Release a transaction barrier obtained with journal_lock_updates().
  *
  * Should be called without the journal lock held.
@@ -547,8 +547,8 @@ repeat:
 	jbd_lock_bh_state(bh);
 
 	/* We now hold the buffer lock so it is safe to query the buffer
-	 * state.  Is the buffer dirty? 
-	 * 
+	 * state.  Is the buffer dirty?
+	 *
 	 * If so, there are two possibilities.  The buffer may be
 	 * non-journaled, and undergoing a quite legitimate writeback.
 	 * Otherwise, it is journaled, and we don't expect dirty buffers
@@ -566,7 +566,7 @@ repeat:
 		 */
 		if (jh->b_transaction) {
 			J_ASSERT_JH(jh,
-				jh->b_transaction == transaction || 
+				jh->b_transaction == transaction ||
 				jh->b_transaction ==
 					journal->j_committing_transaction);
 			if (jh->b_next_transaction)
@@ -580,7 +580,7 @@ repeat:
 		 */
 		JBUFFER_TRACE(jh, "Unexpected dirty buffer");
 		jbd_unexpected_dirty_buffer(jh);
- 	}
+	}
 
 	unlock_buffer(bh);
 
@@ -653,7 +653,7 @@ repeat:
 		 * buffer had better remain locked during the kmalloc,
 		 * but that should be true --- we hold the journal lock
 		 * still and the buffer is already on the BUF_JOURNAL
-		 * list so won't be flushed. 
+		 * list so won't be flushed.
 		 *
 		 * Subtle point, though: if this is a get_undo_access,
 		 * then we will be relying on the frozen_data to contain
@@ -765,8 +765,8 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
  * manually rather than reading off disk), then we need to keep the
  * buffer_head locked until it has been completely filled with new
  * data.  In this case, we should be able to make the assertion that
- * the bh is not already part of an existing transaction.  
- * 
+ * the bh is not already part of an existing transaction.
+ *
  * The buffer should already be locked by the caller by this point.
  * There is no lock ranking violation: it was a newly created,
  * unlocked buffer beforehand. */
@@ -778,7 +778,7 @@ int journal_get_write_access(handle_t *handle, struct buffer_head *bh)
  *
  * Call this if you create a new bh.
  */
-int journal_get_create_access(handle_t *handle, struct buffer_head *bh) 
+int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
@@ -847,13 +847,13 @@ out:
  * do not reuse freed space until the deallocation has been committed,
  * since if we overwrote that space we would make the delete
  * un-rewindable in case of a crash.
- * 
+ *
  * To deal with that, journal_get_undo_access requests write access to a
  * buffer for parts of non-rewindable operations such as delete
  * operations on the bitmaps.  The journaling code must keep a copy of
  * the buffer's contents prior to the undo_access call until such time
  * as we know that the buffer has definitely been committed to disk.
- * 
+ *
  * We never need to know which transaction the committed data is part
  * of, buffers touched here are guaranteed to be dirtied later and so
  * will be committed to a new transaction in due course, at which point
@@ -911,13 +911,13 @@ out:
 	return err;
 }
 
-/** 
+/**
  * int journal_dirty_data() -  mark a buffer as containing dirty data which
  *                             needs to be flushed before we can commit the
- *                             current transaction.  
+ *                             current transaction.
  * @handle: transaction
  * @bh: bufferhead to mark
- * 
+ *
  * The buffer is placed on the transaction's data list and is marked as
  * belonging to the transaction.
  *
@@ -946,15 +946,15 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 
 	/*
 	 * What if the buffer is already part of a running transaction?
-	 * 
+	 *
 	 * There are two cases:
 	 * 1) It is part of the current running transaction.  Refile it,
 	 *    just in case we have allocated it as metadata, deallocated
-	 *    it, then reallocated it as data. 
+	 *    it, then reallocated it as data.
 	 * 2) It is part of the previous, still-committing transaction.
 	 *    If all we want to do is to guarantee that the buffer will be
 	 *    written to disk before this new transaction commits, then
-	 *    being sure that the *previous* transaction has this same 
+	 *    being sure that the *previous* transaction has this same
 	 *    property is sufficient for us!  Just leave it on its old
 	 *    transaction.
 	 *
@@ -1076,18 +1076,18 @@ no_journal:
 	return 0;
 }
 
-/** 
+/**
  * int journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  * @handle: transaction to add buffer to.
- * @bh: buffer to mark 
- * 
+ * @bh: buffer to mark
+ *
  * mark dirty metadata which needs to be journaled as part of the current
  * transaction.
  *
  * The buffer is placed on the transaction's metadata list and is marked
- * as belonging to the transaction.  
+ * as belonging to the transaction.
  *
- * Returns error number or 0 on success.  
+ * Returns error number or 0 on success.
  *
  * Special care needs to be taken if the buffer already belongs to the
  * current committing transaction (in which case we should have frozen
@@ -1135,11 +1135,11 @@ int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 
 	set_buffer_jbddirty(bh);
 
-	/* 
+	/*
 	 * Metadata already on the current transaction list doesn't
 	 * need to be filed.  Metadata on another transaction's list must
 	 * be committing, and will be refiled once the commit completes:
-	 * leave it alone for now. 
+	 * leave it alone for now.
 	 */
 	if (jh->b_transaction != transaction) {
 		JBUFFER_TRACE(jh, "already on other transaction");
@@ -1165,7 +1165,7 @@ out:
 	return 0;
 }
 
-/* 
+/*
  * journal_release_buffer: undo a get_write_access without any buffer
  * updates, if the update decided in the end that it didn't need access.
  *
@@ -1176,20 +1176,20 @@ journal_release_buffer(handle_t *handle, struct buffer_head *bh)
 	BUFFER_TRACE(bh, "entry");
 }
 
-/** 
+/**
  * void journal_forget() - bforget() for potentially-journaled buffers.
  * @handle: transaction handle
  * @bh:     bh to 'forget'
  *
  * We can only do the bforget if there are no commits pending against the
  * buffer.  If the buffer is dirty in the current running transaction we
- * can safely unlink it. 
+ * can safely unlink it.
  *
  * bh may not be a journalled buffer at all - it may be a non-JBD
  * buffer which came off the hashtable.  Check for this.
  *
  * Decrements bh->b_count by one.
- * 
+ *
  * Allow this call even if the handle has aborted --- it may be part of
  * the caller's cleanup after an abort.
  */
@@ -1237,7 +1237,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
 
 		drop_reserve = 1;
 
-		/* 
+		/*
 		 * We are no longer going to journal this buffer.
 		 * However, the commit of this transaction is still
 		 * important to the buffer: the delete that we are now
@@ -1246,7 +1246,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
 		 *
 		 * So, if we have a checkpoint on the buffer, we should
 		 * now refile the buffer on our BJ_Forget list so that
-		 * we know to remove the checkpoint after we commit. 
+		 * we know to remove the checkpoint after we commit.
 		 */
 
 		if (jh->b_cp_transaction) {
@@ -1264,7 +1264,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh)
 			}
 		}
 	} else if (jh->b_transaction) {
-		J_ASSERT_JH(jh, (jh->b_transaction == 
+		J_ASSERT_JH(jh, (jh->b_transaction ==
 				 journal->j_committing_transaction));
 		/* However, if the buffer is still owned by a prior
 		 * (committing) transaction, we can't drop it yet... */
@@ -1294,7 +1294,7 @@ drop:
 /**
  * int journal_stop() - complete a transaction
  * @handle: tranaction to complete.
- * 
+ *
  * All done for a particular handle.
  *
  * There is not much action needed here.  We just return any remaining
@@ -1303,7 +1303,7 @@ drop:
  * filesystem is marked for synchronous update.
  *
  * journal_stop itself will not usually return an error, but it may
- * do so in unusual circumstances.  In particular, expect it to 
+ * do so in unusual circumstances.  In particular, expect it to
  * return -EIO if a journal_abort has been executed since the
  * transaction began.
  */
@@ -1314,13 +1314,14 @@ int journal_stop(handle_t *handle)
 	int old_handle_count, err;
 	pid_t pid;
 
-	J_ASSERT(transaction->t_updates > 0);
 	J_ASSERT(journal_current_handle() == handle);
 
 	if (is_handle_aborted(handle))
 		err = -EIO;
-	else
+	else {
+		J_ASSERT(transaction->t_updates > 0);
 		err = 0;
+	}
 
 	if (--handle->h_ref > 0) {
 		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
@@ -1373,7 +1374,7 @@ int journal_stop(handle_t *handle)
 	if (handle->h_sync ||
 			transaction->t_outstanding_credits >
 				journal->j_max_transaction_buffers ||
-	    		time_after_eq(jiffies, transaction->t_expires)) {
+			time_after_eq(jiffies, transaction->t_expires)) {
 		/* Do this even for aborted journals: an abort still
 		 * completes the commit thread, it just doesn't write
 		 * anything to disk. */
@@ -1388,7 +1389,7 @@ int journal_stop(handle_t *handle)
 
 		/*
 		 * Special case: JFS_SYNC synchronous updates require us
-		 * to wait for the commit to complete.  
+		 * to wait for the commit to complete.
 		 */
 		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
 			err = log_wait_commit(journal, tid);
@@ -1439,7 +1440,7 @@ int journal_force_commit(journal_t *journal)
  * jbd_lock_bh_state(jh2bh(jh)) is held.
  */
 
-static inline void 
+static inline void
 __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
 {
 	if (!*list) {
@@ -1454,7 +1455,7 @@ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
 	}
 }
 
-/* 
+/*
  * Remove a buffer from a transaction list, given the transaction's list
  * head pointer.
  *
@@ -1475,7 +1476,7 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
 	jh->b_tnext->b_tprev = jh->b_tprev;
 }
 
-/* 
+/*
  * Remove a buffer from the appropriate transaction list.
  *
  * Note that this function can *change* the value of
@@ -1595,17 +1596,17 @@ out:
 }
 
 
-/** 
+/**
  * int journal_try_to_free_buffers() - try to free page buffers.
  * @journal: journal for operation
  * @page: to try and free
  * @unused_gfp_mask: unused
  *
- * 
+ *
  * For all the buffers on this page,
  * if they are fully written out ordered data, move them onto BUF_CLEAN
  * so try_to_free_buffers() can reap them.
- * 
+ *
  * This function returns non-zero if we wish try_to_free_buffers()
  * to be called. We do this if the page is releasable by try_to_free_buffers().
  * We also do it if the page has locked or dirty buffers and the caller wants
@@ -1629,7 +1630,7 @@ out:
  * cannot happen because we never reallocate freed data as metadata
  * while the data is part of a transaction.  Yes?
  */
-int journal_try_to_free_buffers(journal_t *journal, 
+int journal_try_to_free_buffers(journal_t *journal,
 				struct page *page, gfp_t unused_gfp_mask)
 {
 	struct buffer_head *head;
@@ -1697,7 +1698,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
 }
 
 /*
- * journal_invalidatepage 
+ * journal_invalidatepage
  *
  * This code is tricky.  It has a number of cases to deal with.
  *
@@ -1705,15 +1706,15 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
  *
  * i_size must be updated on disk before we start calling invalidatepage on the
  * data.
- * 
+ *
  *  This is done in ext3 by defining an ext3_setattr method which
  *  updates i_size before truncate gets going.  By maintaining this
  *  invariant, we can be sure that it is safe to throw away any buffers
  *  attached to the current transaction: once the transaction commits,
  *  we know that the data will not be needed.
- * 
+ *
  *  Note however that we can *not* throw away data belonging to the
- *  previous, committing transaction!  
+ *  previous, committing transaction!
  *
  * Any disk blocks which *are* part of the previous, committing
  * transaction (and which therefore cannot be discarded immediately) are
@@ -1732,7 +1733,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
  * don't make guarantees about the order in which data hits disk --- in
  * particular we don't guarantee that new dirty data is flushed before
  * transaction commit --- so it is always safe just to discard data
- * immediately in that mode.  --sct 
+ * immediately in that mode.  --sct
  */
 
 /*
@@ -1876,9 +1877,9 @@ zap_buffer_unlocked:
 	return may_free;
 }
 
-/** 
+/**
  * void journal_invalidatepage()
- * @journal: journal to use for flush... 
+ * @journal: journal to use for flush...
  * @page:    page to flush
  * @offset:  length of page to invalidate.
  *
@@ -1886,7 +1887,7 @@ zap_buffer_unlocked:
  *
  */
 void journal_invalidatepage(journal_t *journal,
-		      struct page *page, 
+		      struct page *page,
 		      unsigned long offset)
 {
 	struct buffer_head *head, *bh, *next;
@@ -1908,7 +1909,7 @@ void journal_invalidatepage(journal_t *journal,
 		next = bh->b_this_page;
 
 		if (offset <= curr_off) {
-		 	/* This block is wholly outside the truncation point */
+			/* This block is wholly outside the truncation point */
 			lock_buffer(bh);
 			may_free &= journal_unmap_buffer(journal, bh);
 			unlock_buffer(bh);
@@ -1924,8 +1925,8 @@ void journal_invalidatepage(journal_t *journal,
 	}
 }
 
-/* 
- * File a buffer on the given transaction list. 
+/*
+ * File a buffer on the given transaction list.
  */
 void __journal_file_buffer(struct journal_head *jh,
 			transaction_t *transaction, int jlist)
@@ -1948,7 +1949,7 @@ void __journal_file_buffer(struct journal_head *jh,
 	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
 	 * state. */
 
-	if (jlist == BJ_Metadata || jlist == BJ_Reserved || 
+	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
 	    jlist == BJ_Shadow || jlist == BJ_Forget) {
 		if (test_clear_buffer_dirty(bh) ||
 		    test_clear_buffer_jbddirty(bh))
@@ -2008,7 +2009,7 @@ void journal_file_buffer(struct journal_head *jh,
 	jbd_unlock_bh_state(jh2bh(jh));
 }
 
-/* 
+/*
  * Remove a buffer from its current buffer list in preparation for
  * dropping it from its current transaction entirely.  If the buffer has
  * already started to be used by a subsequent transaction, refile the
@@ -2060,7 +2061,7 @@ void __journal_refile_buffer(struct journal_head *jh)
  * to the caller to remove the journal_head if necessary.  For the
  * unlocked journal_refile_buffer call, the caller isn't going to be
  * doing anything else to the buffer so we need to do the cleanup
- * ourselves to avoid a jh leak. 
+ * ourselves to avoid a jh leak.
  *
  * *** The journal_head may be freed by this call! ***
  */
diff --git a/fs/jbd2/Makefile b/fs/jbd2/Makefile
new file mode 100644
index 000000000000..802a3413872a
--- /dev/null
+++ b/fs/jbd2/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux journaling routines.
+#
+
+obj-$(CONFIG_JBD2) += jbd2.o
+
+jbd2-objs := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
new file mode 100644
index 000000000000..68039fa9a566
--- /dev/null
+++ b/fs/jbd2/checkpoint.c
@@ -0,0 +1,697 @@
+/*
+ * linux/fs/checkpoint.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1999 Red Hat Software --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Checkpoint routines for the generic filesystem journaling code.
+ * Part of the ext2fs journaling system.
+ *
+ * Checkpointing is the process of ensuring that a section of the log is
+ * committed fully to disk, so that that portion of the log can be
+ * reused.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+
+/*
+ * Unlink a buffer from a transaction checkpoint list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink_first(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	jh->b_cpnext->b_cpprev = jh->b_cpprev;
+	jh->b_cpprev->b_cpnext = jh->b_cpnext;
+	if (transaction->t_checkpoint_list == jh) {
+		transaction->t_checkpoint_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_list == jh)
+			transaction->t_checkpoint_list = NULL;
+	}
+}
+
+/*
+ * Unlink a buffer from a transaction checkpoint(io) list.
+ *
+ * Called with j_list_lock held.
+ */
+static inline void __buffer_unlink(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+	if (transaction->t_checkpoint_io_list == jh) {
+		transaction->t_checkpoint_io_list = jh->b_cpnext;
+		if (transaction->t_checkpoint_io_list == jh)
+			transaction->t_checkpoint_io_list = NULL;
+	}
+}
+
+/*
+ * Move a buffer from the checkpoint list to the checkpoint io list
+ *
+ * Called with j_list_lock held
+ */
+static inline void __buffer_relink_io(struct journal_head *jh)
+{
+	transaction_t *transaction = jh->b_cp_transaction;
+
+	__buffer_unlink_first(jh);
+
+	if (!transaction->t_checkpoint_io_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_io_list;
+		jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_io_list = jh;
+}
+
+/*
+ * Try to release a checkpointed buffer from its transaction.
+ * Returns 1 if we released it and 2 if we also released the
+ * whole transaction.
+ *
+ * Requires j_list_lock
+ * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
+ */
+static int __try_to_free_cp_buf(struct journal_head *jh)
+{
+	int ret = 0;
+	struct buffer_head *bh = jh2bh(jh);
+
+	if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+		JBUFFER_TRACE(jh, "remove from checkpoint list");
+		ret = __jbd2_journal_remove_checkpoint(jh) + 1;
+		jbd_unlock_bh_state(bh);
+		jbd2_journal_remove_journal_head(bh);
+		BUFFER_TRACE(bh, "release");
+		__brelse(bh);
+	} else {
+		jbd_unlock_bh_state(bh);
+	}
+	return ret;
+}
+
+/*
+ * __jbd2_log_wait_for_space: wait until there is space in the journal.
+ *
+ * Called under j-state_lock *only*.  It will be unlocked if we have to wait
+ * for a checkpoint to free up some space in the log.
+ */
+void __jbd2_log_wait_for_space(journal_t *journal)
+{
+	int nblocks;
+	assert_spin_locked(&journal->j_state_lock);
+
+	nblocks = jbd_space_needed(journal);
+	while (__jbd2_log_space_left(journal) < nblocks) {
+		if (journal->j_flags & JBD2_ABORT)
+			return;
+		spin_unlock(&journal->j_state_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
+
+		/*
+		 * Test again, another process may have checkpointed while we
+		 * were waiting for the checkpoint lock
+		 */
+		spin_lock(&journal->j_state_lock);
+		nblocks = jbd_space_needed(journal);
+		if (__jbd2_log_space_left(journal) < nblocks) {
+			spin_unlock(&journal->j_state_lock);
+			jbd2_log_do_checkpoint(journal);
+			spin_lock(&journal->j_state_lock);
+		}
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	}
+}
+
+/*
+ * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
+ * The caller must restart a list walk.  Wait for someone else to run
+ * jbd_unlock_bh_state().
+ */
+static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
+	__releases(journal->j_list_lock)
+{
+	get_bh(bh);
+	spin_unlock(&journal->j_list_lock);
+	jbd_lock_bh_state(bh);
+	jbd_unlock_bh_state(bh);
+	put_bh(bh);
+}
+
+/*
+ * Clean up transaction's list of buffers submitted for io.
+ * We wait for any pending IO to complete and remove any clean
+ * buffers. Note that we take the buffers in the opposite ordering
+ * from the one in which they were submitted for IO.
+ *
+ * Called with j_list_lock held.
+ */
+static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+{
+	struct journal_head *jh;
+	struct buffer_head *bh;
+	tid_t this_tid;
+	int released = 0;
+
+	this_tid = transaction->t_tid;
+restart:
+	/* Did somebody clean up the transaction in the meanwhile? */
+	if (journal->j_checkpoint_transactions != transaction ||
+			transaction->t_tid != this_tid)
+		return;
+	while (!released && transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
+		bh = jh2bh(jh);
+		if (!jbd_trylock_bh_state(bh)) {
+			jbd_sync_bh(journal, bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
+		if (buffer_locked(bh)) {
+			atomic_inc(&bh->b_count);
+			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
+			wait_on_buffer(bh);
+			/* the journal_head may have gone by now */
+			BUFFER_TRACE(bh, "brelse");
+			__brelse(bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
+		/*
+		 * Now in whatever state the buffer currently is, we know that
+		 * it has been written out and so we can drop it from the list
+		 */
+		released = __jbd2_journal_remove_checkpoint(jh);
+		jbd_unlock_bh_state(bh);
+		jbd2_journal_remove_journal_head(bh);
+		__brelse(bh);
+	}
+}
+
+#define NR_BATCH	64
+
+static void
+__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
+{
+	int i;
+
+	ll_rw_block(SWRITE, *batch_count, bhs);
+	for (i = 0; i < *batch_count; i++) {
+		struct buffer_head *bh = bhs[i];
+		clear_buffer_jwrite(bh);
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+	}
+	*batch_count = 0;
+}
+
+/*
+ * Try to flush one buffer from the checkpoint list to disk.
+ *
+ * Return 1 if something happened which requires us to abort the current
+ * scan of the checkpoint list.
+ *
+ * Called with j_list_lock held and drops it if 1 is returned
+ * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
+ */
+static int __process_buffer(journal_t *journal, struct journal_head *jh,
+			struct buffer_head **bhs, int *batch_count)
+{
+	struct buffer_head *bh = jh2bh(jh);
+	int ret = 0;
+
+	if (buffer_locked(bh)) {
+		atomic_inc(&bh->b_count);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		wait_on_buffer(bh);
+		/* the journal_head may have gone by now */
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+		ret = 1;
+	} else if (jh->b_transaction != NULL) {
+		transaction_t *t = jh->b_transaction;
+		tid_t tid = t->t_tid;
+
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		jbd2_log_start_commit(journal, tid);
+		jbd2_log_wait_commit(journal, tid);
+		ret = 1;
+	} else if (!buffer_dirty(bh)) {
+		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
+		BUFFER_TRACE(bh, "remove from checkpoint");
+		__jbd2_journal_remove_checkpoint(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		jbd2_journal_remove_journal_head(bh);
+		__brelse(bh);
+		ret = 1;
+	} else {
+		/*
+		 * Important: we are about to write the buffer, and
+		 * possibly block, while still holding the journal lock.
+		 * We cannot afford to let the transaction logic start
+		 * messing around with this buffer before we write it to
+		 * disk, as that would break recoverability.
+		 */
+		BUFFER_TRACE(bh, "queue");
+		get_bh(bh);
+		J_ASSERT_BH(bh, !buffer_jwrite(bh));
+		set_buffer_jwrite(bh);
+		bhs[*batch_count] = bh;
+		__buffer_relink_io(jh);
+		jbd_unlock_bh_state(bh);
+		(*batch_count)++;
+		if (*batch_count == NR_BATCH) {
+			spin_unlock(&journal->j_list_lock);
+			__flush_batch(journal, bhs, batch_count);
+			ret = 1;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Perform an actual checkpoint. We take the first transaction on the
+ * list of transactions to be checkpointed and send all its buffers
+ * to disk. We submit larger chunks of data at once.
+ *
+ * The journal should be locked before calling this function.
+ */
+int jbd2_log_do_checkpoint(journal_t *journal)
+{
+	transaction_t *transaction;
+	tid_t this_tid;
+	int result;
+
+	jbd_debug(1, "Start checkpoint\n");
+
+	/*
+	 * First thing: if there are any transactions in the log which
+	 * don't need checkpointing, just eliminate them from the
+	 * journal straight away.
+	 */
+	result = jbd2_cleanup_journal_tail(journal);
+	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+	if (result <= 0)
+		return result;
+
+	/*
+	 * OK, we need to start writing disk blocks.  Take one transaction
+	 * and write it.
+	 */
+	spin_lock(&journal->j_list_lock);
+	if (!journal->j_checkpoint_transactions)
+		goto out;
+	transaction = journal->j_checkpoint_transactions;
+	this_tid = transaction->t_tid;
+restart:
+	/*
+	 * If someone cleaned up this transaction while we slept, we're
+	 * done (maybe it's a new transaction, but it fell at the same
+	 * address).
+	 */
+	if (journal->j_checkpoint_transactions == transaction &&
+			transaction->t_tid == this_tid) {
+		int batch_count = 0;
+		struct buffer_head *bhs[NR_BATCH];
+		struct journal_head *jh;
+		int retry = 0;
+
+		while (!retry && transaction->t_checkpoint_list) {
+			struct buffer_head *bh;
+
+			jh = transaction->t_checkpoint_list;
+			bh = jh2bh(jh);
+			if (!jbd_trylock_bh_state(bh)) {
+				jbd_sync_bh(journal, bh);
+				retry = 1;
+				break;
+			}
+			retry = __process_buffer(journal, jh, bhs,&batch_count);
+			if (!retry && lock_need_resched(&journal->j_list_lock)){
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+				break;
+			}
+		}
+
+		if (batch_count) {
+			if (!retry) {
+				spin_unlock(&journal->j_list_lock);
+				retry = 1;
+			}
+			__flush_batch(journal, bhs, &batch_count);
+		}
+
+		if (retry) {
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+		}
+		/*
+		 * Now we have cleaned up the first transaction's checkpoint
+		 * list. Let's clean up the second one
+		 */
+		__wait_cp_io(journal, transaction);
+	}
+out:
+	spin_unlock(&journal->j_list_lock);
+	result = jbd2_cleanup_journal_tail(journal);
+	if (result < 0)
+		return result;
+	return 0;
+}
+
+/*
+ * Check the list of checkpoint transactions for the journal to see if
+ * we have already got rid of any since the last update of the log tail
+ * in the journal superblock.  If so, we can instantly roll the
+ * superblock forward to remove those transactions from the log.
+ *
+ * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
+ *
+ * Called with the journal lock held.
+ *
+ * This is the only part of the journaling code which really needs to be
+ * aware of transaction aborts.  Checkpointing involves writing to the
+ * main filesystem area rather than to the journal, so it can proceed
+ * even in abort state, but we must not update the journal superblock if
+ * we have an abort error outstanding.
+ */
+
+int jbd2_cleanup_journal_tail(journal_t *journal)
+{
+	transaction_t * transaction;
+	tid_t		first_tid;
+	unsigned long	blocknr, freed;
+
+	/* OK, work out the oldest transaction remaining in the log, and
+	 * the log block it starts at.
+	 *
+	 * If the log is now empty, we need to work out which is the
+	 * next transaction ID we will write, and where it will
+	 * start. */
+
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	transaction = journal->j_checkpoint_transactions;
+	if (transaction) {
+		first_tid = transaction->t_tid;
+		blocknr = transaction->t_log_start;
+	} else if ((transaction = journal->j_committing_transaction) != NULL) {
+		first_tid = transaction->t_tid;
+		blocknr = transaction->t_log_start;
+	} else if ((transaction = journal->j_running_transaction) != NULL) {
+		first_tid = transaction->t_tid;
+		blocknr = journal->j_head;
+	} else {
+		first_tid = journal->j_transaction_sequence;
+		blocknr = journal->j_head;
+	}
+	spin_unlock(&journal->j_list_lock);
+	J_ASSERT(blocknr != 0);
+
+	/* If the oldest pinned transaction is at the tail of the log
+           already then there's not much we can do right now. */
+	if (journal->j_tail_sequence == first_tid) {
+		spin_unlock(&journal->j_state_lock);
+		return 1;
+	}
+
+	/* OK, update the superblock to recover the freed space.
+	 * Physical blocks come first: have we wrapped beyond the end of
+	 * the log?  */
+	freed = blocknr - journal->j_tail;
+	if (blocknr < journal->j_tail)
+		freed = freed + journal->j_last - journal->j_first;
+
+	jbd_debug(1,
+		  "Cleaning journal tail from %d to %d (offset %lu), "
+		  "freeing %lu\n",
+		  journal->j_tail_sequence, first_tid, blocknr, freed);
+
+	journal->j_free += freed;
+	journal->j_tail_sequence = first_tid;
+	journal->j_tail = blocknr;
+	spin_unlock(&journal->j_state_lock);
+	if (!(journal->j_flags & JBD2_ABORT))
+		jbd2_journal_update_superblock(journal, 1);
+	return 0;
+}
+
+
+/* Checkpoint list management */
+
+/*
+ * journal_clean_one_cp_list
+ *
+ * Find all the written-back checkpoint buffers in the given list and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of bufers reaped (for debug)
+ */
+
+static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+{
+	struct journal_head *last_jh;
+	struct journal_head *next_jh = jh;
+	int ret, freed = 0;
+
+	*released = 0;
+	if (!jh)
+		return 0;
+
+	last_jh = jh->b_cpprev;
+	do {
+		jh = next_jh;
+		next_jh = jh->b_cpnext;
+		/* Use trylock because of the ranking */
+		if (jbd_trylock_bh_state(jh2bh(jh))) {
+			ret = __try_to_free_cp_buf(jh);
+			if (ret) {
+				freed++;
+				if (ret == 2) {
+					*released = 1;
+					return freed;
+				}
+			}
+		}
+		/*
+		 * This function only frees up some memory
+		 * if possible so we dont have an obligation
+		 * to finish processing. Bail out if preemption
+		 * requested:
+		 */
+		if (need_resched())
+			return freed;
+	} while (jh != last_jh);
+
+	return freed;
+}
+
+/*
+ * journal_clean_checkpoint_list
+ *
+ * Find all the written-back checkpoint buffers in the journal and release them.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ * Returns number of buffers reaped (for debug)
+ */
+
+int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
+{
+	transaction_t *transaction, *last_transaction, *next_transaction;
+	int ret = 0;
+	int released;
+
+	transaction = journal->j_checkpoint_transactions;
+	if (!transaction)
+		goto out;
+
+	last_transaction = transaction->t_cpprev;
+	next_transaction = transaction;
+	do {
+		transaction = next_transaction;
+		next_transaction = transaction->t_cpnext;
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_list, &released);
+		/*
+		 * This function only frees up some memory if possible so we
+		 * dont have an obligation to finish processing. Bail out if
+		 * preemption requested:
+		 */
+		if (need_resched())
+			goto out;
+		if (released)
+			continue;
+		/*
+		 * It is essential that we are as careful as in the case of
+		 * t_checkpoint_list with removing the buffer from the list as
+		 * we can possibly see not yet submitted buffers on io_list
+		 */
+		ret += journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list, &released);
+		if (need_resched())
+			goto out;
+	} while (transaction != last_transaction);
+out:
+	return ret;
+}
+
+/*
+ * journal_remove_checkpoint: called after a buffer has been committed
+ * to disk (either by being write-back flushed to disk, or being
+ * committed to the log).
+ *
+ * We cannot safely clean a transaction out of the log until all of the
+ * buffer updates committed in that transaction have safely been stored
+ * elsewhere on disk.  To achieve this, all of the buffers in a
+ * transaction need to be maintained on the transaction's checkpoint
+ * lists until they have been rewritten, at which point this function is
+ * called to remove the buffer from the existing transaction's
+ * checkpoint lists.
+ *
+ * The function returns 1 if it frees the transaction, 0 otherwise.
+ *
+ * This function is called with the journal locked.
+ * This function is called with j_list_lock held.
+ * This function is called with jbd_lock_bh_state(jh2bh(jh))
+ */
+
+int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
+{
+	transaction_t *transaction;
+	journal_t *journal;
+	int ret = 0;
+
+	JBUFFER_TRACE(jh, "entry");
+
+	if ((transaction = jh->b_cp_transaction) == NULL) {
+		JBUFFER_TRACE(jh, "not on transaction");
+		goto out;
+	}
+	journal = transaction->t_journal;
+
+	__buffer_unlink(jh);
+	jh->b_cp_transaction = NULL;
+
+	if (transaction->t_checkpoint_list != NULL ||
+	    transaction->t_checkpoint_io_list != NULL)
+		goto out;
+	JBUFFER_TRACE(jh, "transaction has no more buffers");
+
+	/*
+	 * There is one special case to worry about: if we have just pulled the
+	 * buffer off a committing transaction's forget list, then even if the
+	 * checkpoint list is empty, the transaction obviously cannot be
+	 * dropped!
+	 *
+	 * The locking here around j_committing_transaction is a bit sleazy.
+	 * See the comment at the end of jbd2_journal_commit_transaction().
+	 */
+	if (transaction == journal->j_committing_transaction) {
+		JBUFFER_TRACE(jh, "belongs to committing transaction");
+		goto out;
+	}
+
+	/* OK, that was the last buffer for the transaction: we can now
+	   safely remove this transaction from the log */
+
+	__jbd2_journal_drop_transaction(journal, transaction);
+
+	/* Just in case anybody was waiting for more transactions to be
+           checkpointed... */
+	wake_up(&journal->j_wait_logspace);
+	ret = 1;
+out:
+	JBUFFER_TRACE(jh, "exit");
+	return ret;
+}
+
+/*
+ * journal_insert_checkpoint: put a committed buffer onto a checkpoint
+ * list so that we know when it is safe to clean the transaction out of
+ * the log.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ */
+void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
+			       transaction_t *transaction)
+{
+	JBUFFER_TRACE(jh, "entry");
+	J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh)));
+	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
+
+	jh->b_cp_transaction = transaction;
+
+	if (!transaction->t_checkpoint_list) {
+		jh->b_cpnext = jh->b_cpprev = jh;
+	} else {
+		jh->b_cpnext = transaction->t_checkpoint_list;
+		jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
+		jh->b_cpprev->b_cpnext = jh;
+		jh->b_cpnext->b_cpprev = jh;
+	}
+	transaction->t_checkpoint_list = jh;
+}
+
+/*
+ * We've finished with this transaction structure: adios...
+ *
+ * The transaction must have no links except for the checkpoint by this
+ * point.
+ *
+ * Called with the journal locked.
+ * Called with j_list_lock held.
+ */
+
+void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
+{
+	assert_spin_locked(&journal->j_list_lock);
+	if (transaction->t_cpnext) {
+		transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
+		transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
+		if (journal->j_checkpoint_transactions == transaction)
+			journal->j_checkpoint_transactions =
+				transaction->t_cpnext;
+		if (journal->j_checkpoint_transactions == transaction)
+			journal->j_checkpoint_transactions = NULL;
+	}
+
+	J_ASSERT(transaction->t_state == T_FINISHED);
+	J_ASSERT(transaction->t_buffers == NULL);
+	J_ASSERT(transaction->t_sync_datalist == NULL);
+	J_ASSERT(transaction->t_forget == NULL);
+	J_ASSERT(transaction->t_iobuf_list == NULL);
+	J_ASSERT(transaction->t_shadow_list == NULL);
+	J_ASSERT(transaction->t_log_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_io_list == NULL);
+	J_ASSERT(transaction->t_updates == 0);
+	J_ASSERT(journal->j_committing_transaction != transaction);
+	J_ASSERT(journal->j_running_transaction != transaction);
+
+	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
+	kfree(transaction);
+}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
new file mode 100644
index 000000000000..70b2ae1ef281
--- /dev/null
+++ b/fs/jbd2/commit.c
@@ -0,0 +1,920 @@
+/*
+ * linux/fs/jbd2/commit.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal commit routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+
+/*
+ * Default IO end handler for temporary BJ_IO buffer_heads.
+ */
+static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	BUFFER_TRACE(bh, "");
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+	unlock_buffer(bh);
+}
+
+/*
+ * When an ext3-ordered file is truncated, it is possible that many pages are
+ * not sucessfully freed, because they are attached to a committing transaction.
+ * After the transaction commits, these pages are left on the LRU, with no
+ * ->mapping, and with attached buffers.  These pages are trivially reclaimable
+ * by the VM, but their apparent absence upsets the VM accounting, and it makes
+ * the numbers in /proc/meminfo look odd.
+ *
+ * So here, we have a buffer which has just come off the forget list.  Look to
+ * see if we can strip all buffers from the backing page.
+ *
+ * Called under lock_journal(), and possibly under journal_datalist_lock.  The
+ * caller provided us with a ref against the buffer, and we drop that here.
+ */
+static void release_buffer_page(struct buffer_head *bh)
+{
+	struct page *page;
+
+	if (buffer_dirty(bh))
+		goto nope;
+	if (atomic_read(&bh->b_count) != 1)
+		goto nope;
+	page = bh->b_page;
+	if (!page)
+		goto nope;
+	if (page->mapping)
+		goto nope;
+
+	/* OK, it's a truncated page */
+	if (TestSetPageLocked(page))
+		goto nope;
+
+	page_cache_get(page);
+	__brelse(bh);
+	try_to_free_buffers(page);
+	unlock_page(page);
+	page_cache_release(page);
+	return;
+
+nope:
+	__brelse(bh);
+}
+
+/*
+ * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
+ * held.  For ranking reasons we must trylock.  If we lose, schedule away and
+ * return 0.  j_list_lock is dropped in this case.
+ */
+static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+{
+	if (!jbd_trylock_bh_state(bh)) {
+		spin_unlock(&journal->j_list_lock);
+		schedule();
+		return 0;
+	}
+	return 1;
+}
+
+/* Done it all: now write the commit record.  We should have
+ * cleaned up our previous buffers by now, so if we are in abort
+ * mode we can now just skip the rest of the journal write
+ * entirely.
+ *
+ * Returns 1 if the journal needs to be aborted or 0 on success
+ */
+static int journal_write_commit_record(journal_t *journal,
+					transaction_t *commit_transaction)
+{
+	struct journal_head *descriptor;
+	struct buffer_head *bh;
+	int i, ret;
+	int barrier_done = 0;
+
+	if (is_journal_aborted(journal))
+		return 0;
+
+	descriptor = jbd2_journal_get_descriptor_buffer(journal);
+	if (!descriptor)
+		return 1;
+
+	bh = jh2bh(descriptor);
+
+	/* AKPM: buglet - add `i' to tmp! */
+	for (i = 0; i < bh->b_size; i += 512) {
+		journal_header_t *tmp = (journal_header_t*)bh->b_data;
+		tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
+		tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
+		tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+	}
+
+	JBUFFER_TRACE(descriptor, "write commit block");
+	set_buffer_dirty(bh);
+	if (journal->j_flags & JBD2_BARRIER) {
+		set_buffer_ordered(bh);
+		barrier_done = 1;
+	}
+	ret = sync_dirty_buffer(bh);
+	/* is it possible for another commit to fail at roughly
+	 * the same time as this one?  If so, we don't want to
+	 * trust the barrier flag in the super, but instead want
+	 * to remember if we sent a barrier request
+	 */
+	if (ret == -EOPNOTSUPP && barrier_done) {
+		char b[BDEVNAME_SIZE];
+
+		printk(KERN_WARNING
+			"JBD: barrier-based sync failed on %s - "
+			"disabling barriers\n",
+			bdevname(journal->j_dev, b));
+		spin_lock(&journal->j_state_lock);
+		journal->j_flags &= ~JBD2_BARRIER;
+		spin_unlock(&journal->j_state_lock);
+
+		/* And try again, without the barrier */
+		clear_buffer_ordered(bh);
+		set_buffer_uptodate(bh);
+		set_buffer_dirty(bh);
+		ret = sync_dirty_buffer(bh);
+	}
+	put_bh(bh);		/* One for getblk() */
+	jbd2_journal_put_journal_head(descriptor);
+
+	return (ret == -EIO);
+}
+
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+{
+	int i;
+
+	for (i = 0; i < bufs; i++) {
+		wbuf[i]->b_end_io = end_buffer_write_sync;
+		/* We use-up our safety reference in submit_bh() */
+		submit_bh(WRITE, wbuf[i]);
+	}
+}
+
+/*
+ *  Submit all the data buffers to disk
+ */
+static void journal_submit_data_buffers(journal_t *journal,
+				transaction_t *commit_transaction)
+{
+	struct journal_head *jh;
+	struct buffer_head *bh;
+	int locked;
+	int bufs = 0;
+	struct buffer_head **wbuf = journal->j_wbuf;
+
+	/*
+	 * Whenever we unlock the journal and sleep, things can get added
+	 * onto ->t_sync_datalist, so we have to keep looping back to
+	 * write_out_data until we *know* that the list is empty.
+	 *
+	 * Cleanup any flushed data buffers from the data list.  Even in
+	 * abort mode, we want to flush this out as soon as possible.
+	 */
+write_out_data:
+	cond_resched();
+	spin_lock(&journal->j_list_lock);
+
+	while (commit_transaction->t_sync_datalist) {
+		jh = commit_transaction->t_sync_datalist;
+		bh = jh2bh(jh);
+		locked = 0;
+
+		/* Get reference just to make sure buffer does not disappear
+		 * when we are forced to drop various locks */
+		get_bh(bh);
+		/* If the buffer is dirty, we need to submit IO and hence
+		 * we need the buffer lock. We try to lock the buffer without
+		 * blocking. If we fail, we need to drop j_list_lock and do
+		 * blocking lock_buffer().
+		 */
+		if (buffer_dirty(bh)) {
+			if (test_set_buffer_locked(bh)) {
+				BUFFER_TRACE(bh, "needs blocking lock");
+				spin_unlock(&journal->j_list_lock);
+				/* Write out all data to prevent deadlocks */
+				journal_do_submit_data(wbuf, bufs);
+				bufs = 0;
+				lock_buffer(bh);
+				spin_lock(&journal->j_list_lock);
+			}
+			locked = 1;
+		}
+		/* We have to get bh_state lock. Again out of order, sigh. */
+		if (!inverted_lock(journal, bh)) {
+			jbd_lock_bh_state(bh);
+			spin_lock(&journal->j_list_lock);
+		}
+		/* Someone already cleaned up the buffer? */
+		if (!buffer_jbd(bh)
+			|| jh->b_transaction != commit_transaction
+			|| jh->b_jlist != BJ_SyncData) {
+			jbd_unlock_bh_state(bh);
+			if (locked)
+				unlock_buffer(bh);
+			BUFFER_TRACE(bh, "already cleaned up");
+			put_bh(bh);
+			continue;
+		}
+		if (locked && test_clear_buffer_dirty(bh)) {
+			BUFFER_TRACE(bh, "needs writeout, adding to array");
+			wbuf[bufs++] = bh;
+			__jbd2_journal_file_buffer(jh, commit_transaction,
+						BJ_Locked);
+			jbd_unlock_bh_state(bh);
+			if (bufs == journal->j_wbufsize) {
+				spin_unlock(&journal->j_list_lock);
+				journal_do_submit_data(wbuf, bufs);
+				bufs = 0;
+				goto write_out_data;
+			}
+		}
+		else {
+			BUFFER_TRACE(bh, "writeout complete: unfile");
+			__jbd2_journal_unfile_buffer(jh);
+			jbd_unlock_bh_state(bh);
+			if (locked)
+				unlock_buffer(bh);
+			jbd2_journal_remove_journal_head(bh);
+			/* Once for our safety reference, once for
+			 * jbd2_journal_remove_journal_head() */
+			put_bh(bh);
+			put_bh(bh);
+		}
+
+		if (lock_need_resched(&journal->j_list_lock)) {
+			spin_unlock(&journal->j_list_lock);
+			goto write_out_data;
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+	journal_do_submit_data(wbuf, bufs);
+}
+
+static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
+				   unsigned long long block)
+{
+	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
+	if (tag_bytes > JBD_TAG_SIZE32)
+		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
+}
+
+/*
+ * jbd2_journal_commit_transaction
+ *
+ * The primary function for committing a transaction to the log.  This
+ * function is called by the journal thread to begin a complete commit.
+ */
+void jbd2_journal_commit_transaction(journal_t *journal)
+{
+	transaction_t *commit_transaction;
+	struct journal_head *jh, *new_jh, *descriptor;
+	struct buffer_head **wbuf = journal->j_wbuf;
+	int bufs;
+	int flags;
+	int err;
+	unsigned long long blocknr;
+	char *tagp = NULL;
+	journal_header_t *header;
+	journal_block_tag_t *tag = NULL;
+	int space_left = 0;
+	int first_tag = 0;
+	int tag_flag;
+	int i;
+	int tag_bytes = journal_tag_bytes(journal);
+
+	/*
+	 * First job: lock down the current transaction and wait for
+	 * all outstanding updates to complete.
+	 */
+
+#ifdef COMMIT_STATS
+	spin_lock(&journal->j_list_lock);
+	summarise_journal_usage(journal);
+	spin_unlock(&journal->j_list_lock);
+#endif
+
+	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
+	if (journal->j_flags & JBD2_FLUSHED) {
+		jbd_debug(3, "super block updated\n");
+		jbd2_journal_update_superblock(journal, 1);
+	} else {
+		jbd_debug(3, "superblock not updated\n");
+	}
+
+	J_ASSERT(journal->j_running_transaction != NULL);
+	J_ASSERT(journal->j_committing_transaction == NULL);
+
+	commit_transaction = journal->j_running_transaction;
+	J_ASSERT(commit_transaction->t_state == T_RUNNING);
+
+	jbd_debug(1, "JBD: starting commit of transaction %d\n",
+			commit_transaction->t_tid);
+
+	spin_lock(&journal->j_state_lock);
+	commit_transaction->t_state = T_LOCKED;
+
+	spin_lock(&commit_transaction->t_handle_lock);
+	while (commit_transaction->t_updates) {
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+					TASK_UNINTERRUPTIBLE);
+		if (commit_transaction->t_updates) {
+			spin_unlock(&commit_transaction->t_handle_lock);
+			spin_unlock(&journal->j_state_lock);
+			schedule();
+			spin_lock(&journal->j_state_lock);
+			spin_lock(&commit_transaction->t_handle_lock);
+		}
+		finish_wait(&journal->j_wait_updates, &wait);
+	}
+	spin_unlock(&commit_transaction->t_handle_lock);
+
+	J_ASSERT (commit_transaction->t_outstanding_credits <=
+			journal->j_max_transaction_buffers);
+
+	/*
+	 * First thing we are allowed to do is to discard any remaining
+	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
+	 * that there are no such buffers: if a large filesystem
+	 * operation like a truncate needs to split itself over multiple
+	 * transactions, then it may try to do a jbd2_journal_restart() while
+	 * there are still BJ_Reserved buffers outstanding.  These must
+	 * be released cleanly from the current transaction.
+	 *
+	 * In this case, the filesystem must still reserve write access
+	 * again before modifying the buffer in the new transaction, but
+	 * we do not require it to remember exactly which old buffers it
+	 * has reserved.  This is consistent with the existing behaviour
+	 * that multiple jbd2_journal_get_write_access() calls to the same
+	 * buffer are perfectly permissable.
+	 */
+	while (commit_transaction->t_reserved_list) {
+		jh = commit_transaction->t_reserved_list;
+		JBUFFER_TRACE(jh, "reserved, unused: refile");
+		/*
+		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
+		 * leave undo-committed data.
+		 */
+		if (jh->b_committed_data) {
+			struct buffer_head *bh = jh2bh(jh);
+
+			jbd_lock_bh_state(bh);
+			jbd2_slab_free(jh->b_committed_data, bh->b_size);
+			jh->b_committed_data = NULL;
+			jbd_unlock_bh_state(bh);
+		}
+		jbd2_journal_refile_buffer(journal, jh);
+	}
+
+	/*
+	 * Now try to drop any written-back buffers from the journal's
+	 * checkpoint lists.  We do this *before* commit because it potentially
+	 * frees some memory
+	 */
+	spin_lock(&journal->j_list_lock);
+	__jbd2_journal_clean_checkpoint_list(journal);
+	spin_unlock(&journal->j_list_lock);
+
+	jbd_debug (3, "JBD: commit phase 1\n");
+
+	/*
+	 * Switch to a new revoke table.
+	 */
+	jbd2_journal_switch_revoke_table(journal);
+
+	commit_transaction->t_state = T_FLUSH;
+	journal->j_committing_transaction = commit_transaction;
+	journal->j_running_transaction = NULL;
+	commit_transaction->t_log_start = journal->j_head;
+	wake_up(&journal->j_wait_transaction_locked);
+	spin_unlock(&journal->j_state_lock);
+
+	jbd_debug (3, "JBD: commit phase 2\n");
+
+	/*
+	 * First, drop modified flag: all accesses to the buffers
+	 * will be tracked for a new trasaction only -bzzz
+	 */
+	spin_lock(&journal->j_list_lock);
+	if (commit_transaction->t_buffers) {
+		new_jh = jh = commit_transaction->t_buffers->b_tnext;
+		do {
+			J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
+					new_jh->b_modified == 0);
+			new_jh->b_modified = 0;
+			new_jh = new_jh->b_tnext;
+		} while (new_jh != jh);
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	/*
+	 * Now start flushing things to disk, in the order they appear
+	 * on the transaction lists.  Data blocks go first.
+	 */
+	err = 0;
+	journal_submit_data_buffers(journal, commit_transaction);
+
+	/*
+	 * Wait for all previously submitted IO to complete.
+	 */
+	spin_lock(&journal->j_list_lock);
+	while (commit_transaction->t_locked_list) {
+		struct buffer_head *bh;
+
+		jh = commit_transaction->t_locked_list->b_tprev;
+		bh = jh2bh(jh);
+		get_bh(bh);
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			wait_on_buffer(bh);
+			if (unlikely(!buffer_uptodate(bh)))
+				err = -EIO;
+			spin_lock(&journal->j_list_lock);
+		}
+		if (!inverted_lock(journal, bh)) {
+			put_bh(bh);
+			spin_lock(&journal->j_list_lock);
+			continue;
+		}
+		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+			__jbd2_journal_unfile_buffer(jh);
+			jbd_unlock_bh_state(bh);
+			jbd2_journal_remove_journal_head(bh);
+			put_bh(bh);
+		} else {
+			jbd_unlock_bh_state(bh);
+		}
+		put_bh(bh);
+		cond_resched_lock(&journal->j_list_lock);
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	if (err)
+		__jbd2_journal_abort_hard(journal);
+
+	jbd2_journal_write_revoke_records(journal, commit_transaction);
+
+	jbd_debug(3, "JBD: commit phase 2\n");
+
+	/*
+	 * If we found any dirty or locked buffers, then we should have
+	 * looped back up to the write_out_data label.  If there weren't
+	 * any then journal_clean_data_list should have wiped the list
+	 * clean by now, so check that it is in fact empty.
+	 */
+	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+
+	jbd_debug (3, "JBD: commit phase 3\n");
+
+	/*
+	 * Way to go: we have now written out all of the data for a
+	 * transaction!  Now comes the tricky part: we need to write out
+	 * metadata.  Loop over the transaction's entire buffer list:
+	 */
+	commit_transaction->t_state = T_COMMIT;
+
+	descriptor = NULL;
+	bufs = 0;
+	while (commit_transaction->t_buffers) {
+
+		/* Find the next buffer to be journaled... */
+
+		jh = commit_transaction->t_buffers;
+
+		/* If we're in abort mode, we just un-journal the buffer and
+		   release it for background writing. */
+
+		if (is_journal_aborted(journal)) {
+			JBUFFER_TRACE(jh, "journal is aborting: refile");
+			jbd2_journal_refile_buffer(journal, jh);
+			/* If that was the last one, we need to clean up
+			 * any descriptor buffers which may have been
+			 * already allocated, even if we are now
+			 * aborting. */
+			if (!commit_transaction->t_buffers)
+				goto start_journal_io;
+			continue;
+		}
+
+		/* Make sure we have a descriptor block in which to
+		   record the metadata buffer. */
+
+		if (!descriptor) {
+			struct buffer_head *bh;
+
+			J_ASSERT (bufs == 0);
+
+			jbd_debug(4, "JBD: get descriptor\n");
+
+			descriptor = jbd2_journal_get_descriptor_buffer(journal);
+			if (!descriptor) {
+				__jbd2_journal_abort_hard(journal);
+				continue;
+			}
+
+			bh = jh2bh(descriptor);
+			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
+				(unsigned long long)bh->b_blocknr, bh->b_data);
+			header = (journal_header_t *)&bh->b_data[0];
+			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
+			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
+			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
+
+			tagp = &bh->b_data[sizeof(journal_header_t)];
+			space_left = bh->b_size - sizeof(journal_header_t);
+			first_tag = 1;
+			set_buffer_jwrite(bh);
+			set_buffer_dirty(bh);
+			wbuf[bufs++] = bh;
+
+			/* Record it so that we can wait for IO
+                           completion later */
+			BUFFER_TRACE(bh, "ph3: file as descriptor");
+			jbd2_journal_file_buffer(descriptor, commit_transaction,
+					BJ_LogCtl);
+		}
+
+		/* Where is the buffer to be written? */
+
+		err = jbd2_journal_next_log_block(journal, &blocknr);
+		/* If the block mapping failed, just abandon the buffer
+		   and repeat this loop: we'll fall into the
+		   refile-on-abort condition above. */
+		if (err) {
+			__jbd2_journal_abort_hard(journal);
+			continue;
+		}
+
+		/*
+		 * start_this_handle() uses t_outstanding_credits to determine
+		 * the free space in the log, but this counter is changed
+		 * by jbd2_journal_next_log_block() also.
+		 */
+		commit_transaction->t_outstanding_credits--;
+
+		/* Bump b_count to prevent truncate from stumbling over
+                   the shadowed buffer!  @@@ This can go if we ever get
+                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+		atomic_inc(&jh2bh(jh)->b_count);
+
+		/* Make a temporary IO buffer with which to write it out
+                   (this will requeue both the metadata buffer and the
+                   temporary IO buffer). new_bh goes on BJ_IO*/
+
+		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+		/*
+		 * akpm: jbd2_journal_write_metadata_buffer() sets
+		 * new_bh->b_transaction to commit_transaction.
+		 * We need to clean this up before we release new_bh
+		 * (which is of type BJ_IO)
+		 */
+		JBUFFER_TRACE(jh, "ph3: write metadata");
+		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
+						      jh, &new_jh, blocknr);
+		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+		wbuf[bufs++] = jh2bh(new_jh);
+
+		/* Record the new block's tag in the current descriptor
+                   buffer */
+
+		tag_flag = 0;
+		if (flags & 1)
+			tag_flag |= JBD2_FLAG_ESCAPE;
+		if (!first_tag)
+			tag_flag |= JBD2_FLAG_SAME_UUID;
+
+		tag = (journal_block_tag_t *) tagp;
+		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
+		tag->t_flags = cpu_to_be32(tag_flag);
+		tagp += tag_bytes;
+		space_left -= tag_bytes;
+
+		if (first_tag) {
+			memcpy (tagp, journal->j_uuid, 16);
+			tagp += 16;
+			space_left -= 16;
+			first_tag = 0;
+		}
+
+		/* If there's no more to do, or if the descriptor is full,
+		   let the IO rip! */
+
+		if (bufs == journal->j_wbufsize ||
+		    commit_transaction->t_buffers == NULL ||
+		    space_left < tag_bytes + 16) {
+
+			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+
+			/* Write an end-of-descriptor marker before
+                           submitting the IOs.  "tag" still points to
+                           the last tag we set up. */
+
+			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
+
+start_journal_io:
+			for (i = 0; i < bufs; i++) {
+				struct buffer_head *bh = wbuf[i];
+				lock_buffer(bh);
+				clear_buffer_dirty(bh);
+				set_buffer_uptodate(bh);
+				bh->b_end_io = journal_end_buffer_io_sync;
+				submit_bh(WRITE, bh);
+			}
+			cond_resched();
+
+			/* Force a new descriptor to be generated next
+                           time round the loop. */
+			descriptor = NULL;
+			bufs = 0;
+		}
+	}
+
+	/* Lo and behold: we have just managed to send a transaction to
+           the log.  Before we can commit it, wait for the IO so far to
+           complete.  Control buffers being written are on the
+           transaction's t_log_list queue, and metadata buffers are on
+           the t_iobuf_list queue.
+
+	   Wait for the buffers in reverse order.  That way we are
+	   less likely to be woken up until all IOs have completed, and
+	   so we incur less scheduling load.
+	*/
+
+	jbd_debug(3, "JBD: commit phase 4\n");
+
+	/*
+	 * akpm: these are BJ_IO, and j_list_lock is not needed.
+	 * See __journal_try_to_free_buffer.
+	 */
+wait_for_iobuf:
+	while (commit_transaction->t_iobuf_list != NULL) {
+		struct buffer_head *bh;
+
+		jh = commit_transaction->t_iobuf_list->b_tprev;
+		bh = jh2bh(jh);
+		if (buffer_locked(bh)) {
+			wait_on_buffer(bh);
+			goto wait_for_iobuf;
+		}
+		if (cond_resched())
+			goto wait_for_iobuf;
+
+		if (unlikely(!buffer_uptodate(bh)))
+			err = -EIO;
+
+		clear_buffer_jwrite(bh);
+
+		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
+		jbd2_journal_unfile_buffer(journal, jh);
+
+		/*
+		 * ->t_iobuf_list should contain only dummy buffer_heads
+		 * which were created by jbd2_journal_write_metadata_buffer().
+		 */
+		BUFFER_TRACE(bh, "dumping temporary bh");
+		jbd2_journal_put_journal_head(jh);
+		__brelse(bh);
+		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
+		free_buffer_head(bh);
+
+		/* We also have to unlock and free the corresponding
+                   shadowed buffer */
+		jh = commit_transaction->t_shadow_list->b_tprev;
+		bh = jh2bh(jh);
+		clear_bit(BH_JWrite, &bh->b_state);
+		J_ASSERT_BH(bh, buffer_jbddirty(bh));
+
+		/* The metadata is now released for reuse, but we need
+                   to remember it against this transaction so that when
+                   we finally commit, we can do any checkpointing
+                   required. */
+		JBUFFER_TRACE(jh, "file as BJ_Forget");
+		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
+		/* Wake up any transactions which were waiting for this
+		   IO to complete */
+		wake_up_bit(&bh->b_state, BH_Unshadow);
+		JBUFFER_TRACE(jh, "brelse shadowed buffer");
+		__brelse(bh);
+	}
+
+	J_ASSERT (commit_transaction->t_shadow_list == NULL);
+
+	jbd_debug(3, "JBD: commit phase 5\n");
+
+	/* Here we wait for the revoke record and descriptor record buffers */
+ wait_for_ctlbuf:
+	while (commit_transaction->t_log_list != NULL) {
+		struct buffer_head *bh;
+
+		jh = commit_transaction->t_log_list->b_tprev;
+		bh = jh2bh(jh);
+		if (buffer_locked(bh)) {
+			wait_on_buffer(bh);
+			goto wait_for_ctlbuf;
+		}
+		if (cond_resched())
+			goto wait_for_ctlbuf;
+
+		if (unlikely(!buffer_uptodate(bh)))
+			err = -EIO;
+
+		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+		clear_buffer_jwrite(bh);
+		jbd2_journal_unfile_buffer(journal, jh);
+		jbd2_journal_put_journal_head(jh);
+		__brelse(bh);		/* One for getblk */
+		/* AKPM: bforget here */
+	}
+
+	jbd_debug(3, "JBD: commit phase 6\n");
+
+	if (journal_write_commit_record(journal, commit_transaction))
+		err = -EIO;
+
+	if (err)
+		__jbd2_journal_abort_hard(journal);
+
+	/* End of a transaction!  Finally, we can do checkpoint
+           processing: any buffers committed as a result of this
+           transaction can be removed from any checkpoint list it was on
+           before. */
+
+	jbd_debug(3, "JBD: commit phase 7\n");
+
+	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+	J_ASSERT(commit_transaction->t_buffers == NULL);
+	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
+	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
+	J_ASSERT(commit_transaction->t_shadow_list == NULL);
+	J_ASSERT(commit_transaction->t_log_list == NULL);
+
+restart_loop:
+	/*
+	 * As there are other places (journal_unmap_buffer()) adding buffers
+	 * to this list we have to be careful and hold the j_list_lock.
+	 */
+	spin_lock(&journal->j_list_lock);
+	while (commit_transaction->t_forget) {
+		transaction_t *cp_transaction;
+		struct buffer_head *bh;
+
+		jh = commit_transaction->t_forget;
+		spin_unlock(&journal->j_list_lock);
+		bh = jh2bh(jh);
+		jbd_lock_bh_state(bh);
+		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
+			jh->b_transaction == journal->j_running_transaction);
+
+		/*
+		 * If there is undo-protected committed data against
+		 * this buffer, then we can remove it now.  If it is a
+		 * buffer needing such protection, the old frozen_data
+		 * field now points to a committed version of the
+		 * buffer, so rotate that field to the new committed
+		 * data.
+		 *
+		 * Otherwise, we can just throw away the frozen data now.
+		 */
+		if (jh->b_committed_data) {
+			jbd2_slab_free(jh->b_committed_data, bh->b_size);
+			jh->b_committed_data = NULL;
+			if (jh->b_frozen_data) {
+				jh->b_committed_data = jh->b_frozen_data;
+				jh->b_frozen_data = NULL;
+			}
+		} else if (jh->b_frozen_data) {
+			jbd2_slab_free(jh->b_frozen_data, bh->b_size);
+			jh->b_frozen_data = NULL;
+		}
+
+		spin_lock(&journal->j_list_lock);
+		cp_transaction = jh->b_cp_transaction;
+		if (cp_transaction) {
+			JBUFFER_TRACE(jh, "remove from old cp transaction");
+			__jbd2_journal_remove_checkpoint(jh);
+		}
+
+		/* Only re-checkpoint the buffer_head if it is marked
+		 * dirty.  If the buffer was added to the BJ_Forget list
+		 * by jbd2_journal_forget, it may no longer be dirty and
+		 * there's no point in keeping a checkpoint record for
+		 * it. */
+
+		/* A buffer which has been freed while still being
+		 * journaled by a previous transaction may end up still
+		 * being dirty here, but we want to avoid writing back
+		 * that buffer in the future now that the last use has
+		 * been committed.  That's not only a performance gain,
+		 * it also stops aliasing problems if the buffer is left
+		 * behind for writeback and gets reallocated for another
+		 * use in a different page. */
+		if (buffer_freed(bh)) {
+			clear_buffer_freed(bh);
+			clear_buffer_jbddirty(bh);
+		}
+
+		if (buffer_jbddirty(bh)) {
+			JBUFFER_TRACE(jh, "add to new checkpointing trans");
+			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
+			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
+			__jbd2_journal_refile_buffer(jh);
+			jbd_unlock_bh_state(bh);
+		} else {
+			J_ASSERT_BH(bh, !buffer_dirty(bh));
+			/* The buffer on BJ_Forget list and not jbddirty means
+			 * it has been freed by this transaction and hence it
+			 * could not have been reallocated until this
+			 * transaction has committed. *BUT* it could be
+			 * reallocated once we have written all the data to
+			 * disk and before we process the buffer on BJ_Forget
+			 * list. */
+			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+			__jbd2_journal_refile_buffer(jh);
+			if (!jh->b_transaction) {
+				jbd_unlock_bh_state(bh);
+				 /* needs a brelse */
+				jbd2_journal_remove_journal_head(bh);
+				release_buffer_page(bh);
+			} else
+				jbd_unlock_bh_state(bh);
+		}
+		cond_resched_lock(&journal->j_list_lock);
+	}
+	spin_unlock(&journal->j_list_lock);
+	/*
+	 * This is a bit sleazy.  We borrow j_list_lock to protect
+	 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
+	 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but
+	 * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint
+	 */
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
+	/*
+	 * Now recheck if some buffers did not get attached to the transaction
+	 * while the lock was dropped...
+	 */
+	if (commit_transaction->t_forget) {
+		spin_unlock(&journal->j_list_lock);
+		spin_unlock(&journal->j_state_lock);
+		goto restart_loop;
+	}
+
+	/* Done with this transaction! */
+
+	jbd_debug(3, "JBD: commit phase 8\n");
+
+	J_ASSERT(commit_transaction->t_state == T_COMMIT);
+
+	commit_transaction->t_state = T_FINISHED;
+	J_ASSERT(commit_transaction == journal->j_committing_transaction);
+	journal->j_commit_sequence = commit_transaction->t_tid;
+	journal->j_committing_transaction = NULL;
+	spin_unlock(&journal->j_state_lock);
+
+	if (commit_transaction->t_checkpoint_list == NULL) {
+		__jbd2_journal_drop_transaction(journal, commit_transaction);
+	} else {
+		if (journal->j_checkpoint_transactions == NULL) {
+			journal->j_checkpoint_transactions = commit_transaction;
+			commit_transaction->t_cpnext = commit_transaction;
+			commit_transaction->t_cpprev = commit_transaction;
+		} else {
+			commit_transaction->t_cpnext =
+				journal->j_checkpoint_transactions;
+			commit_transaction->t_cpprev =
+				commit_transaction->t_cpnext->t_cpprev;
+			commit_transaction->t_cpnext->t_cpprev =
+				commit_transaction;
+			commit_transaction->t_cpprev->t_cpnext =
+				commit_transaction;
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+
+	jbd_debug(1, "JBD: commit %d complete, head %d\n",
+		  journal->j_commit_sequence, journal->j_tail_sequence);
+
+	wake_up(&journal->j_wait_done_commit);
+}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
new file mode 100644
index 000000000000..c60f378b0f76
--- /dev/null
+++ b/fs/jbd2/journal.c
@@ -0,0 +1,2084 @@
+/*
+ * linux/fs/jbd2/journal.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Generic filesystem journal-writing code; part of the ext2fs
+ * journaling system.
+ *
+ * This file manages journals: areas of disk reserved for logging
+ * transactional updates.  This includes the kernel journaling thread
+ * which is responsible for scheduling updates to the log.
+ *
+ * We do not actually manage the physical storage of the journal in this
+ * file: that is left to a per-journal policy function, which allows us
+ * to store the journal within a filesystem-specified area for ext2
+ * journaling (ext2 can use a reserved inode for storing the log).
+ */
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/pagemap.h>
+#include <linux/kthread.h>
+#include <linux/poison.h>
+#include <linux/proc_fs.h>
+
+#include <asm/uaccess.h>
+#include <asm/page.h>
+
+EXPORT_SYMBOL(jbd2_journal_start);
+EXPORT_SYMBOL(jbd2_journal_restart);
+EXPORT_SYMBOL(jbd2_journal_extend);
+EXPORT_SYMBOL(jbd2_journal_stop);
+EXPORT_SYMBOL(jbd2_journal_lock_updates);
+EXPORT_SYMBOL(jbd2_journal_unlock_updates);
+EXPORT_SYMBOL(jbd2_journal_get_write_access);
+EXPORT_SYMBOL(jbd2_journal_get_create_access);
+EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_dirty_data);
+EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
+EXPORT_SYMBOL(jbd2_journal_release_buffer);
+EXPORT_SYMBOL(jbd2_journal_forget);
+#if 0
+EXPORT_SYMBOL(journal_sync_buffer);
+#endif
+EXPORT_SYMBOL(jbd2_journal_flush);
+EXPORT_SYMBOL(jbd2_journal_revoke);
+
+EXPORT_SYMBOL(jbd2_journal_init_dev);
+EXPORT_SYMBOL(jbd2_journal_init_inode);
+EXPORT_SYMBOL(jbd2_journal_update_format);
+EXPORT_SYMBOL(jbd2_journal_check_used_features);
+EXPORT_SYMBOL(jbd2_journal_check_available_features);
+EXPORT_SYMBOL(jbd2_journal_set_features);
+EXPORT_SYMBOL(jbd2_journal_create);
+EXPORT_SYMBOL(jbd2_journal_load);
+EXPORT_SYMBOL(jbd2_journal_destroy);
+EXPORT_SYMBOL(jbd2_journal_update_superblock);
+EXPORT_SYMBOL(jbd2_journal_abort);
+EXPORT_SYMBOL(jbd2_journal_errno);
+EXPORT_SYMBOL(jbd2_journal_ack_err);
+EXPORT_SYMBOL(jbd2_journal_clear_err);
+EXPORT_SYMBOL(jbd2_log_wait_commit);
+EXPORT_SYMBOL(jbd2_journal_start_commit);
+EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
+EXPORT_SYMBOL(jbd2_journal_wipe);
+EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
+EXPORT_SYMBOL(jbd2_journal_invalidatepage);
+EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
+EXPORT_SYMBOL(jbd2_journal_force_commit);
+
+static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+static void __journal_abort_soft (journal_t *journal, int errno);
+static int jbd2_journal_create_jbd_slab(size_t slab_size);
+
+/*
+ * Helper function used to manage commit timeouts
+ */
+
+static void commit_timeout(unsigned long __data)
+{
+	struct task_struct * p = (struct task_struct *) __data;
+
+	wake_up_process(p);
+}
+
+/*
+ * kjournald2: The main thread function used to manage a logging device
+ * journal.
+ *
+ * This kernel thread is responsible for two things:
+ *
+ * 1) COMMIT:  Every so often we need to commit the current state of the
+ *    filesystem to disk.  The journal thread is responsible for writing
+ *    all of the metadata buffers to disk.
+ *
+ * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
+ *    of the data in that part of the log has been rewritten elsewhere on
+ *    the disk.  Flushing these old buffers to reclaim space in the log is
+ *    known as checkpointing, and this thread is responsible for that job.
+ */
+
+static int kjournald2(void *arg)
+{
+	journal_t *journal = arg;
+	transaction_t *transaction;
+
+	/*
+	 * Set up an interval timer which can be used to trigger a commit wakeup
+	 * after the commit interval expires
+	 */
+	setup_timer(&journal->j_commit_timer, commit_timeout,
+			(unsigned long)current);
+
+	/* Record that the journal thread is running */
+	journal->j_task = current;
+	wake_up(&journal->j_wait_done_commit);
+
+	printk(KERN_INFO "kjournald2 starting.  Commit interval %ld seconds\n",
+			journal->j_commit_interval / HZ);
+
+	/*
+	 * And now, wait forever for commit wakeup events.
+	 */
+	spin_lock(&journal->j_state_lock);
+
+loop:
+	if (journal->j_flags & JBD2_UNMOUNT)
+		goto end_loop;
+
+	jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
+		journal->j_commit_sequence, journal->j_commit_request);
+
+	if (journal->j_commit_sequence != journal->j_commit_request) {
+		jbd_debug(1, "OK, requests differ\n");
+		spin_unlock(&journal->j_state_lock);
+		del_timer_sync(&journal->j_commit_timer);
+		jbd2_journal_commit_transaction(journal);
+		spin_lock(&journal->j_state_lock);
+		goto loop;
+	}
+
+	wake_up(&journal->j_wait_done_commit);
+	if (freezing(current)) {
+		/*
+		 * The simpler the better. Flushing journal isn't a
+		 * good idea, because that depends on threads that may
+		 * be already stopped.
+		 */
+		jbd_debug(1, "Now suspending kjournald2\n");
+		spin_unlock(&journal->j_state_lock);
+		refrigerator();
+		spin_lock(&journal->j_state_lock);
+	} else {
+		/*
+		 * We assume on resume that commits are already there,
+		 * so we don't sleep
+		 */
+		DEFINE_WAIT(wait);
+		int should_sleep = 1;
+
+		prepare_to_wait(&journal->j_wait_commit, &wait,
+				TASK_INTERRUPTIBLE);
+		if (journal->j_commit_sequence != journal->j_commit_request)
+			should_sleep = 0;
+		transaction = journal->j_running_transaction;
+		if (transaction && time_after_eq(jiffies,
+						transaction->t_expires))
+			should_sleep = 0;
+		if (journal->j_flags & JBD2_UNMOUNT)
+			should_sleep = 0;
+		if (should_sleep) {
+			spin_unlock(&journal->j_state_lock);
+			schedule();
+			spin_lock(&journal->j_state_lock);
+		}
+		finish_wait(&journal->j_wait_commit, &wait);
+	}
+
+	jbd_debug(1, "kjournald2 wakes\n");
+
+	/*
+	 * Were we woken up by a commit wakeup event?
+	 */
+	transaction = journal->j_running_transaction;
+	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
+		journal->j_commit_request = transaction->t_tid;
+		jbd_debug(1, "woke because of timeout\n");
+	}
+	goto loop;
+
+end_loop:
+	spin_unlock(&journal->j_state_lock);
+	del_timer_sync(&journal->j_commit_timer);
+	journal->j_task = NULL;
+	wake_up(&journal->j_wait_done_commit);
+	jbd_debug(1, "Journal thread exiting.\n");
+	return 0;
+}
+
+static void jbd2_journal_start_thread(journal_t *journal)
+{
+	kthread_run(kjournald2, journal, "kjournald2");
+	wait_event(journal->j_wait_done_commit, journal->j_task != 0);
+}
+
+static void journal_kill_thread(journal_t *journal)
+{
+	spin_lock(&journal->j_state_lock);
+	journal->j_flags |= JBD2_UNMOUNT;
+
+	while (journal->j_task) {
+		wake_up(&journal->j_wait_commit);
+		spin_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_done_commit, journal->j_task == 0);
+		spin_lock(&journal->j_state_lock);
+	}
+	spin_unlock(&journal->j_state_lock);
+}
+
+/*
+ * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
+ *
+ * Writes a metadata buffer to a given disk block.  The actual IO is not
+ * performed but a new buffer_head is constructed which labels the data
+ * to be written with the correct destination disk block.
+ *
+ * Any magic-number escaping which needs to be done will cause a
+ * copy-out here.  If the buffer happens to start with the
+ * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
+ * magic number is only written to the log for descripter blocks.  In
+ * this case, we copy the data and replace the first word with 0, and we
+ * return a result code which indicates that this buffer needs to be
+ * marked as an escaped buffer in the corresponding log descriptor
+ * block.  The missing word can then be restored when the block is read
+ * during recovery.
+ *
+ * If the source buffer has already been modified by a new transaction
+ * since we took the last commit snapshot, we use the frozen copy of
+ * that data for IO.  If we end up using the existing buffer_head's data
+ * for the write, then we *have* to lock the buffer to prevent anyone
+ * else from using and possibly modifying it while the IO is in
+ * progress.
+ *
+ * The function returns a pointer to the buffer_heads to be used for IO.
+ *
+ * We assume that the journal has already been locked in this function.
+ *
+ * Return value:
+ *  <0: Error
+ * >=0: Finished OK
+ *
+ * On success:
+ * Bit 0 set == escape performed on the data
+ * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+ */
+
+int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
+				  struct journal_head  *jh_in,
+				  struct journal_head **jh_out,
+				  unsigned long long blocknr)
+{
+	int need_copy_out = 0;
+	int done_copy_out = 0;
+	int do_escape = 0;
+	char *mapped_data;
+	struct buffer_head *new_bh;
+	struct journal_head *new_jh;
+	struct page *new_page;
+	unsigned int new_offset;
+	struct buffer_head *bh_in = jh2bh(jh_in);
+
+	/*
+	 * The buffer really shouldn't be locked: only the current committing
+	 * transaction is allowed to write it, so nobody else is allowed
+	 * to do any IO.
+	 *
+	 * akpm: except if we're journalling data, and write() output is
+	 * also part of a shared mapping, and another thread has
+	 * decided to launch a writepage() against this buffer.
+	 */
+	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
+
+	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
+
+	/*
+	 * If a new transaction has already done a buffer copy-out, then
+	 * we use that version of the data for the commit.
+	 */
+	jbd_lock_bh_state(bh_in);
+repeat:
+	if (jh_in->b_frozen_data) {
+		done_copy_out = 1;
+		new_page = virt_to_page(jh_in->b_frozen_data);
+		new_offset = offset_in_page(jh_in->b_frozen_data);
+	} else {
+		new_page = jh2bh(jh_in)->b_page;
+		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+	}
+
+	mapped_data = kmap_atomic(new_page, KM_USER0);
+	/*
+	 * Check for escaping
+	 */
+	if (*((__be32 *)(mapped_data + new_offset)) ==
+				cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+		need_copy_out = 1;
+		do_escape = 1;
+	}
+	kunmap_atomic(mapped_data, KM_USER0);
+
+	/*
+	 * Do we need to do a data copy?
+	 */
+	if (need_copy_out && !done_copy_out) {
+		char *tmp;
+
+		jbd_unlock_bh_state(bh_in);
+		tmp = jbd2_slab_alloc(bh_in->b_size, GFP_NOFS);
+		jbd_lock_bh_state(bh_in);
+		if (jh_in->b_frozen_data) {
+			jbd2_slab_free(tmp, bh_in->b_size);
+			goto repeat;
+		}
+
+		jh_in->b_frozen_data = tmp;
+		mapped_data = kmap_atomic(new_page, KM_USER0);
+		memcpy(tmp, mapped_data + new_offset, jh2bh(jh_in)->b_size);
+		kunmap_atomic(mapped_data, KM_USER0);
+
+		new_page = virt_to_page(tmp);
+		new_offset = offset_in_page(tmp);
+		done_copy_out = 1;
+	}
+
+	/*
+	 * Did we need to do an escaping?  Now we've done all the
+	 * copying, we can finally do so.
+	 */
+	if (do_escape) {
+		mapped_data = kmap_atomic(new_page, KM_USER0);
+		*((unsigned int *)(mapped_data + new_offset)) = 0;
+		kunmap_atomic(mapped_data, KM_USER0);
+	}
+
+	/* keep subsequent assertions sane */
+	new_bh->b_state = 0;
+	init_buffer(new_bh, NULL, NULL);
+	atomic_set(&new_bh->b_count, 1);
+	jbd_unlock_bh_state(bh_in);
+
+	new_jh = jbd2_journal_add_journal_head(new_bh);	/* This sleeps */
+
+	set_bh_page(new_bh, new_page, new_offset);
+	new_jh->b_transaction = NULL;
+	new_bh->b_size = jh2bh(jh_in)->b_size;
+	new_bh->b_bdev = transaction->t_journal->j_dev;
+	new_bh->b_blocknr = blocknr;
+	set_buffer_mapped(new_bh);
+	set_buffer_dirty(new_bh);
+
+	*jh_out = new_jh;
+
+	/*
+	 * The to-be-written buffer needs to get moved to the io queue,
+	 * and the original buffer whose contents we are shadowing or
+	 * copying is moved to the transaction's shadow queue.
+	 */
+	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
+	jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
+	JBUFFER_TRACE(new_jh, "file as BJ_IO");
+	jbd2_journal_file_buffer(new_jh, transaction, BJ_IO);
+
+	return do_escape | (done_copy_out << 1);
+}
+
+/*
+ * Allocation code for the journal file.  Manage the space left in the
+ * journal, so that we can begin checkpointing when appropriate.
+ */
+
+/*
+ * __jbd2_log_space_left: Return the number of free blocks left in the journal.
+ *
+ * Called with the journal already locked.
+ *
+ * Called under j_state_lock
+ */
+
+int __jbd2_log_space_left(journal_t *journal)
+{
+	int left = journal->j_free;
+
+	assert_spin_locked(&journal->j_state_lock);
+
+	/*
+	 * Be pessimistic here about the number of those free blocks which
+	 * might be required for log descriptor control blocks.
+	 */
+
+#define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
+
+	left -= MIN_LOG_RESERVED_BLOCKS;
+
+	if (left <= 0)
+		return 0;
+	left -= (left >> 3);
+	return left;
+}
+
+/*
+ * Called under j_state_lock.  Returns true if a transaction was started.
+ */
+int __jbd2_log_start_commit(journal_t *journal, tid_t target)
+{
+	/*
+	 * Are we already doing a recent enough commit?
+	 */
+	if (!tid_geq(journal->j_commit_request, target)) {
+		/*
+		 * We want a new commit: OK, mark the request and wakup the
+		 * commit thread.  We do _not_ do the commit ourselves.
+		 */
+
+		journal->j_commit_request = target;
+		jbd_debug(1, "JBD: requesting commit %d/%d\n",
+			  journal->j_commit_request,
+			  journal->j_commit_sequence);
+		wake_up(&journal->j_wait_commit);
+		return 1;
+	}
+	return 0;
+}
+
+int jbd2_log_start_commit(journal_t *journal, tid_t tid)
+{
+	int ret;
+
+	spin_lock(&journal->j_state_lock);
+	ret = __jbd2_log_start_commit(journal, tid);
+	spin_unlock(&journal->j_state_lock);
+	return ret;
+}
+
+/*
+ * Force and wait upon a commit if the calling process is not within
+ * transaction.  This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
+ *
+ * We can only force the running transaction if we don't have an active handle;
+ * otherwise, we will deadlock.
+ *
+ * Returns true if a transaction was started.
+ */
+int jbd2_journal_force_commit_nested(journal_t *journal)
+{
+	transaction_t *transaction = NULL;
+	tid_t tid;
+
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction && !current->journal_info) {
+		transaction = journal->j_running_transaction;
+		__jbd2_log_start_commit(journal, transaction->t_tid);
+	} else if (journal->j_committing_transaction)
+		transaction = journal->j_committing_transaction;
+
+	if (!transaction) {
+		spin_unlock(&journal->j_state_lock);
+		return 0;	/* Nothing to retry */
+	}
+
+	tid = transaction->t_tid;
+	spin_unlock(&journal->j_state_lock);
+	jbd2_log_wait_commit(journal, tid);
+	return 1;
+}
+
+/*
+ * Start a commit of the current running transaction (if any).  Returns true
+ * if a transaction was started, and fills its tid in at *ptid
+ */
+int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
+{
+	int ret = 0;
+
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction) {
+		tid_t tid = journal->j_running_transaction->t_tid;
+
+		ret = __jbd2_log_start_commit(journal, tid);
+		if (ret && ptid)
+			*ptid = tid;
+	} else if (journal->j_committing_transaction && ptid) {
+		/*
+		 * If ext3_write_super() recently started a commit, then we
+		 * have to wait for completion of that transaction
+		 */
+		*ptid = journal->j_committing_transaction->t_tid;
+		ret = 1;
+	}
+	spin_unlock(&journal->j_state_lock);
+	return ret;
+}
+
+/*
+ * Wait for a specified commit to complete.
+ * The caller may not hold the journal lock.
+ */
+int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
+{
+	int err = 0;
+
+#ifdef CONFIG_JBD_DEBUG
+	spin_lock(&journal->j_state_lock);
+	if (!tid_geq(journal->j_commit_request, tid)) {
+		printk(KERN_EMERG
+		       "%s: error: j_commit_request=%d, tid=%d\n",
+		       __FUNCTION__, journal->j_commit_request, tid);
+	}
+	spin_unlock(&journal->j_state_lock);
+#endif
+	spin_lock(&journal->j_state_lock);
+	while (tid_gt(tid, journal->j_commit_sequence)) {
+		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
+				  tid, journal->j_commit_sequence);
+		wake_up(&journal->j_wait_commit);
+		spin_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_done_commit,
+				!tid_gt(tid, journal->j_commit_sequence));
+		spin_lock(&journal->j_state_lock);
+	}
+	spin_unlock(&journal->j_state_lock);
+
+	if (unlikely(is_journal_aborted(journal))) {
+		printk(KERN_EMERG "journal commit I/O error\n");
+		err = -EIO;
+	}
+	return err;
+}
+
+/*
+ * Log buffer allocation routines:
+ */
+
+int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
+{
+	unsigned long blocknr;
+
+	spin_lock(&journal->j_state_lock);
+	J_ASSERT(journal->j_free > 1);
+
+	blocknr = journal->j_head;
+	journal->j_head++;
+	journal->j_free--;
+	if (journal->j_head == journal->j_last)
+		journal->j_head = journal->j_first;
+	spin_unlock(&journal->j_state_lock);
+	return jbd2_journal_bmap(journal, blocknr, retp);
+}
+
+/*
+ * Conversion of logical to physical block numbers for the journal
+ *
+ * On external journals the journal blocks are identity-mapped, so
+ * this is a no-op.  If needed, we can use j_blk_offset - everything is
+ * ready.
+ */
+int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
+		 unsigned long long *retp)
+{
+	int err = 0;
+	unsigned long long ret;
+
+	if (journal->j_inode) {
+		ret = bmap(journal->j_inode, blocknr);
+		if (ret)
+			*retp = ret;
+		else {
+			char b[BDEVNAME_SIZE];
+
+			printk(KERN_ALERT "%s: journal block not found "
+					"at offset %lu on %s\n",
+				__FUNCTION__,
+				blocknr,
+				bdevname(journal->j_dev, b));
+			err = -EIO;
+			__journal_abort_soft(journal, err);
+		}
+	} else {
+		*retp = blocknr; /* +journal->j_blk_offset */
+	}
+	return err;
+}
+
+/*
+ * We play buffer_head aliasing tricks to write data/metadata blocks to
+ * the journal without copying their contents, but for journal
+ * descriptor blocks we do need to generate bona fide buffers.
+ *
+ * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
+ * the buffer's contents they really should run flush_dcache_page(bh->b_page).
+ * But we don't bother doing that, so there will be coherency problems with
+ * mmaps of blockdevs which hold live JBD-controlled filesystems.
+ */
+struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
+{
+	struct buffer_head *bh;
+	unsigned long long blocknr;
+	int err;
+
+	err = jbd2_journal_next_log_block(journal, &blocknr);
+
+	if (err)
+		return NULL;
+
+	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	lock_buffer(bh);
+	memset(bh->b_data, 0, journal->j_blocksize);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	BUFFER_TRACE(bh, "return this buffer");
+	return jbd2_journal_add_journal_head(bh);
+}
+
+/*
+ * Management for journal control blocks: functions to create and
+ * destroy journal_t structures, and to initialise and read existing
+ * journal blocks from disk.  */
+
+/* First: create and setup a journal_t object in memory.  We initialise
+ * very few fields yet: that has to wait until we have created the
+ * journal structures from from scratch, or loaded them from disk. */
+
+static journal_t * journal_init_common (void)
+{
+	journal_t *journal;
+	int err;
+
+	journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
+	if (!journal)
+		goto fail;
+	memset(journal, 0, sizeof(*journal));
+
+	init_waitqueue_head(&journal->j_wait_transaction_locked);
+	init_waitqueue_head(&journal->j_wait_logspace);
+	init_waitqueue_head(&journal->j_wait_done_commit);
+	init_waitqueue_head(&journal->j_wait_checkpoint);
+	init_waitqueue_head(&journal->j_wait_commit);
+	init_waitqueue_head(&journal->j_wait_updates);
+	mutex_init(&journal->j_barrier);
+	mutex_init(&journal->j_checkpoint_mutex);
+	spin_lock_init(&journal->j_revoke_lock);
+	spin_lock_init(&journal->j_list_lock);
+	spin_lock_init(&journal->j_state_lock);
+
+	journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
+
+	/* The journal is marked for error until we succeed with recovery! */
+	journal->j_flags = JBD2_ABORT;
+
+	/* Set up a default-sized revoke table for the new mount. */
+	err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+	if (err) {
+		kfree(journal);
+		goto fail;
+	}
+	return journal;
+fail:
+	return NULL;
+}
+
+/* jbd2_journal_init_dev and jbd2_journal_init_inode:
+ *
+ * Create a journal structure assigned some fixed set of disk blocks to
+ * the journal.  We don't actually touch those disk blocks yet, but we
+ * need to set up all of the mapping information to tell the journaling
+ * system where the journal blocks are.
+ *
+ */
+
+/**
+ *  journal_t * jbd2_journal_init_dev() - creates an initialises a journal structure
+ *  @bdev: Block device on which to create the journal
+ *  @fs_dev: Device which hold journalled filesystem for this journal.
+ *  @start: Block nr Start of journal.
+ *  @len:  Length of the journal in blocks.
+ *  @blocksize: blocksize of journalling device
+ *  @returns: a newly created journal_t *
+ *
+ *  jbd2_journal_init_dev creates a journal which maps a fixed contiguous
+ *  range of blocks on an arbitrary block device.
+ *
+ */
+journal_t * jbd2_journal_init_dev(struct block_device *bdev,
+			struct block_device *fs_dev,
+			unsigned long long start, int len, int blocksize)
+{
+	journal_t *journal = journal_init_common();
+	struct buffer_head *bh;
+	int n;
+
+	if (!journal)
+		return NULL;
+
+	/* journal descriptor can store up to n blocks -bzzz */
+	journal->j_blocksize = blocksize;
+	n = journal->j_blocksize / sizeof(journal_block_tag_t);
+	journal->j_wbufsize = n;
+	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
+	if (!journal->j_wbuf) {
+		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
+			__FUNCTION__);
+		kfree(journal);
+		journal = NULL;
+		goto out;
+	}
+	journal->j_dev = bdev;
+	journal->j_fs_dev = fs_dev;
+	journal->j_blk_offset = start;
+	journal->j_maxlen = len;
+
+	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
+	J_ASSERT(bh != NULL);
+	journal->j_sb_buffer = bh;
+	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+out:
+	return journal;
+}
+
+/**
+ *  journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode.
+ *  @inode: An inode to create the journal in
+ *
+ * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
+ * the journal.  The inode must exist already, must support bmap() and
+ * must have all data blocks preallocated.
+ */
+journal_t * jbd2_journal_init_inode (struct inode *inode)
+{
+	struct buffer_head *bh;
+	journal_t *journal = journal_init_common();
+	int err;
+	int n;
+	unsigned long long blocknr;
+
+	if (!journal)
+		return NULL;
+
+	journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
+	journal->j_inode = inode;
+	jbd_debug(1,
+		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
+		  journal, inode->i_sb->s_id, inode->i_ino,
+		  (long long) inode->i_size,
+		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
+
+	journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
+	journal->j_blocksize = inode->i_sb->s_blocksize;
+
+	/* journal descriptor can store up to n blocks -bzzz */
+	n = journal->j_blocksize / sizeof(journal_block_tag_t);
+	journal->j_wbufsize = n;
+	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
+	if (!journal->j_wbuf) {
+		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
+			__FUNCTION__);
+		kfree(journal);
+		return NULL;
+	}
+
+	err = jbd2_journal_bmap(journal, 0, &blocknr);
+	/* If that failed, give up */
+	if (err) {
+		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
+		       __FUNCTION__);
+		kfree(journal);
+		return NULL;
+	}
+
+	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	J_ASSERT(bh != NULL);
+	journal->j_sb_buffer = bh;
+	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+	return journal;
+}
+
+/*
+ * If the journal init or create aborts, we need to mark the journal
+ * superblock as being NULL to prevent the journal destroy from writing
+ * back a bogus superblock.
+ */
+static void journal_fail_superblock (journal_t *journal)
+{
+	struct buffer_head *bh = journal->j_sb_buffer;
+	brelse(bh);
+	journal->j_sb_buffer = NULL;
+}
+
+/*
+ * Given a journal_t structure, initialise the various fields for
+ * startup of a new journaling session.  We use this both when creating
+ * a journal, and after recovering an old journal to reset it for
+ * subsequent use.
+ */
+
+static int journal_reset(journal_t *journal)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+	unsigned long long first, last;
+
+	first = be32_to_cpu(sb->s_first);
+	last = be32_to_cpu(sb->s_maxlen);
+
+	journal->j_first = first;
+	journal->j_last = last;
+
+	journal->j_head = first;
+	journal->j_tail = first;
+	journal->j_free = last - first;
+
+	journal->j_tail_sequence = journal->j_transaction_sequence;
+	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
+	journal->j_commit_request = journal->j_commit_sequence;
+
+	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+
+	/* Add the dynamic fields and write it to disk. */
+	jbd2_journal_update_superblock(journal, 1);
+	jbd2_journal_start_thread(journal);
+	return 0;
+}
+
+/**
+ * int jbd2_journal_create() - Initialise the new journal file
+ * @journal: Journal to create. This structure must have been initialised
+ *
+ * Given a journal_t structure which tells us which disk blocks we can
+ * use, create a new journal superblock and initialise all of the
+ * journal fields from scratch.
+ **/
+int jbd2_journal_create(journal_t *journal)
+{
+	unsigned long long blocknr;
+	struct buffer_head *bh;
+	journal_superblock_t *sb;
+	int i, err;
+
+	if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
+		printk (KERN_ERR "Journal length (%d blocks) too short.\n",
+			journal->j_maxlen);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}
+
+	if (journal->j_inode == NULL) {
+		/*
+		 * We don't know what block to start at!
+		 */
+		printk(KERN_EMERG
+		       "%s: creation of journal on external device!\n",
+		       __FUNCTION__);
+		BUG();
+	}
+
+	/* Zero out the entire journal on disk.  We cannot afford to
+	   have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
+	jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
+	for (i = 0; i < journal->j_maxlen; i++) {
+		err = jbd2_journal_bmap(journal, i, &blocknr);
+		if (err)
+			return err;
+		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+		lock_buffer(bh);
+		memset (bh->b_data, 0, journal->j_blocksize);
+		BUFFER_TRACE(bh, "marking dirty");
+		mark_buffer_dirty(bh);
+		BUFFER_TRACE(bh, "marking uptodate");
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		__brelse(bh);
+	}
+
+	sync_blockdev(journal->j_dev);
+	jbd_debug(1, "JBD: journal cleared.\n");
+
+	/* OK, fill in the initial static fields in the new superblock */
+	sb = journal->j_superblock;
+
+	sb->s_header.h_magic	 = cpu_to_be32(JBD2_MAGIC_NUMBER);
+	sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
+
+	sb->s_blocksize	= cpu_to_be32(journal->j_blocksize);
+	sb->s_maxlen	= cpu_to_be32(journal->j_maxlen);
+	sb->s_first	= cpu_to_be32(1);
+
+	journal->j_transaction_sequence = 1;
+
+	journal->j_flags &= ~JBD2_ABORT;
+	journal->j_format_version = 2;
+
+	return journal_reset(journal);
+}
+
+/**
+ * void jbd2_journal_update_superblock() - Update journal sb on disk.
+ * @journal: The journal to update.
+ * @wait: Set to '0' if you don't want to wait for IO completion.
+ *
+ * Update a journal's dynamic superblock fields and write it to disk,
+ * optionally waiting for the IO to complete.
+ */
+void jbd2_journal_update_superblock(journal_t *journal, int wait)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+	struct buffer_head *bh = journal->j_sb_buffer;
+
+	/*
+	 * As a special case, if the on-disk copy is already marked as needing
+	 * no recovery (s_start == 0) and there are no outstanding transactions
+	 * in the filesystem, then we can safely defer the superblock update
+	 * until the next commit by setting JBD2_FLUSHED.  This avoids
+	 * attempting a write to a potential-readonly device.
+	 */
+	if (sb->s_start == 0 && journal->j_tail_sequence ==
+				journal->j_transaction_sequence) {
+		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
+			"(start %ld, seq %d, errno %d)\n",
+			journal->j_tail, journal->j_tail_sequence,
+			journal->j_errno);
+		goto out;
+	}
+
+	spin_lock(&journal->j_state_lock);
+	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+
+	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
+	sb->s_start    = cpu_to_be32(journal->j_tail);
+	sb->s_errno    = cpu_to_be32(journal->j_errno);
+	spin_unlock(&journal->j_state_lock);
+
+	BUFFER_TRACE(bh, "marking dirty");
+	mark_buffer_dirty(bh);
+	if (wait)
+		sync_dirty_buffer(bh);
+	else
+		ll_rw_block(SWRITE, 1, &bh);
+
+out:
+	/* If we have just flushed the log (by marking s_start==0), then
+	 * any future commit will have to be careful to update the
+	 * superblock again to re-record the true start of the log. */
+
+	spin_lock(&journal->j_state_lock);
+	if (sb->s_start)
+		journal->j_flags &= ~JBD2_FLUSHED;
+	else
+		journal->j_flags |= JBD2_FLUSHED;
+	spin_unlock(&journal->j_state_lock);
+}
+
+/*
+ * Read the superblock for a given journal, performing initial
+ * validation of the format.
+ */
+
+static int journal_get_superblock(journal_t *journal)
+{
+	struct buffer_head *bh;
+	journal_superblock_t *sb;
+	int err = -EIO;
+
+	bh = journal->j_sb_buffer;
+
+	J_ASSERT(bh != NULL);
+	if (!buffer_uptodate(bh)) {
+		ll_rw_block(READ, 1, &bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			printk (KERN_ERR
+				"JBD: IO error reading journal superblock\n");
+			goto out;
+		}
+	}
+
+	sb = journal->j_superblock;
+
+	err = -EINVAL;
+
+	if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
+	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
+		printk(KERN_WARNING "JBD: no valid journal superblock found\n");
+		goto out;
+	}
+
+	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
+	case JBD2_SUPERBLOCK_V1:
+		journal->j_format_version = 1;
+		break;
+	case JBD2_SUPERBLOCK_V2:
+		journal->j_format_version = 2;
+		break;
+	default:
+		printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
+		goto out;
+	}
+
+	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
+		journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
+	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
+		printk (KERN_WARNING "JBD: journal file too short\n");
+		goto out;
+	}
+
+	return 0;
+
+out:
+	journal_fail_superblock(journal);
+	return err;
+}
+
+/*
+ * Load the on-disk journal superblock and read the key fields into the
+ * journal_t.
+ */
+
+static int load_superblock(journal_t *journal)
+{
+	int err;
+	journal_superblock_t *sb;
+
+	err = journal_get_superblock(journal);
+	if (err)
+		return err;
+
+	sb = journal->j_superblock;
+
+	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
+	journal->j_tail = be32_to_cpu(sb->s_start);
+	journal->j_first = be32_to_cpu(sb->s_first);
+	journal->j_last = be32_to_cpu(sb->s_maxlen);
+	journal->j_errno = be32_to_cpu(sb->s_errno);
+
+	return 0;
+}
+
+
+/**
+ * int jbd2_journal_load() - Read journal from disk.
+ * @journal: Journal to act on.
+ *
+ * Given a journal_t structure which tells us which disk blocks contain
+ * a journal, read the journal from disk to initialise the in-memory
+ * structures.
+ */
+int jbd2_journal_load(journal_t *journal)
+{
+	int err;
+	journal_superblock_t *sb;
+
+	err = load_superblock(journal);
+	if (err)
+		return err;
+
+	sb = journal->j_superblock;
+	/* If this is a V2 superblock, then we have to check the
+	 * features flags on it. */
+
+	if (journal->j_format_version >= 2) {
+		if ((sb->s_feature_ro_compat &
+		     ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
+		    (sb->s_feature_incompat &
+		     ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
+			printk (KERN_WARNING
+				"JBD: Unrecognised features on journal\n");
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Create a slab for this blocksize
+	 */
+	err = jbd2_journal_create_jbd_slab(be32_to_cpu(sb->s_blocksize));
+	if (err)
+		return err;
+
+	/* Let the recovery code check whether it needs to recover any
+	 * data from the journal. */
+	if (jbd2_journal_recover(journal))
+		goto recovery_error;
+
+	/* OK, we've finished with the dynamic journal bits:
+	 * reinitialise the dynamic contents of the superblock in memory
+	 * and reset them on disk. */
+	if (journal_reset(journal))
+		goto recovery_error;
+
+	journal->j_flags &= ~JBD2_ABORT;
+	journal->j_flags |= JBD2_LOADED;
+	return 0;
+
+recovery_error:
+	printk (KERN_WARNING "JBD: recovery failed\n");
+	return -EIO;
+}
+
+/**
+ * void jbd2_journal_destroy() - Release a journal_t structure.
+ * @journal: Journal to act on.
+ *
+ * Release a journal_t structure once it is no longer in use by the
+ * journaled object.
+ */
+void jbd2_journal_destroy(journal_t *journal)
+{
+	/* Wait for the commit thread to wake up and die. */
+	journal_kill_thread(journal);
+
+	/* Force a final log commit */
+	if (journal->j_running_transaction)
+		jbd2_journal_commit_transaction(journal);
+
+	/* Force any old transactions to disk */
+
+	/* Totally anal locking here... */
+	spin_lock(&journal->j_list_lock);
+	while (journal->j_checkpoint_transactions != NULL) {
+		spin_unlock(&journal->j_list_lock);
+		jbd2_log_do_checkpoint(journal);
+		spin_lock(&journal->j_list_lock);
+	}
+
+	J_ASSERT(journal->j_running_transaction == NULL);
+	J_ASSERT(journal->j_committing_transaction == NULL);
+	J_ASSERT(journal->j_checkpoint_transactions == NULL);
+	spin_unlock(&journal->j_list_lock);
+
+	/* We can now mark the journal as empty. */
+	journal->j_tail = 0;
+	journal->j_tail_sequence = ++journal->j_transaction_sequence;
+	if (journal->j_sb_buffer) {
+		jbd2_journal_update_superblock(journal, 1);
+		brelse(journal->j_sb_buffer);
+	}
+
+	if (journal->j_inode)
+		iput(journal->j_inode);
+	if (journal->j_revoke)
+		jbd2_journal_destroy_revoke(journal);
+	kfree(journal->j_wbuf);
+	kfree(journal);
+}
+
+
+/**
+ *int jbd2_journal_check_used_features () - Check if features specified are used.
+ * @journal: Journal to check.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Check whether the journal uses all of a given set of
+ * features.  Return true (non-zero) if it does.
+ **/
+
+int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
+				 unsigned long ro, unsigned long incompat)
+{
+	journal_superblock_t *sb;
+
+	if (!compat && !ro && !incompat)
+		return 1;
+	if (journal->j_format_version == 1)
+		return 0;
+
+	sb = journal->j_superblock;
+
+	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
+	    ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
+	    ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
+		return 1;
+
+	return 0;
+}
+
+/**
+ * int jbd2_journal_check_available_features() - Check feature set in journalling layer
+ * @journal: Journal to check.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Check whether the journaling code supports the use of
+ * all of a given set of features on this journal.  Return true
+ * (non-zero) if it can. */
+
+int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
+				      unsigned long ro, unsigned long incompat)
+{
+	journal_superblock_t *sb;
+
+	if (!compat && !ro && !incompat)
+		return 1;
+
+	sb = journal->j_superblock;
+
+	/* We can support any known requested features iff the
+	 * superblock is in version 2.  Otherwise we fail to support any
+	 * extended sb features. */
+
+	if (journal->j_format_version != 2)
+		return 0;
+
+	if ((compat   & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
+	    (ro       & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
+	    (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
+ * @journal: Journal to act on.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Mark a given journal feature as present on the
+ * superblock.  Returns true if the requested features could be set.
+ *
+ */
+
+int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
+			  unsigned long ro, unsigned long incompat)
+{
+	journal_superblock_t *sb;
+
+	if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
+		return 1;
+
+	if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
+		return 0;
+
+	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+		  compat, ro, incompat);
+
+	sb = journal->j_superblock;
+
+	sb->s_feature_compat    |= cpu_to_be32(compat);
+	sb->s_feature_ro_compat |= cpu_to_be32(ro);
+	sb->s_feature_incompat  |= cpu_to_be32(incompat);
+
+	return 1;
+}
+
+
+/**
+ * int jbd2_journal_update_format () - Update on-disk journal structure.
+ * @journal: Journal to act on.
+ *
+ * Given an initialised but unloaded journal struct, poke about in the
+ * on-disk structure to update it to the most recent supported version.
+ */
+int jbd2_journal_update_format (journal_t *journal)
+{
+	journal_superblock_t *sb;
+	int err;
+
+	err = journal_get_superblock(journal);
+	if (err)
+		return err;
+
+	sb = journal->j_superblock;
+
+	switch (be32_to_cpu(sb->s_header.h_blocktype)) {
+	case JBD2_SUPERBLOCK_V2:
+		return 0;
+	case JBD2_SUPERBLOCK_V1:
+		return journal_convert_superblock_v1(journal, sb);
+	default:
+		break;
+	}
+	return -EINVAL;
+}
+
+static int journal_convert_superblock_v1(journal_t *journal,
+					 journal_superblock_t *sb)
+{
+	int offset, blocksize;
+	struct buffer_head *bh;
+
+	printk(KERN_WARNING
+		"JBD: Converting superblock from version 1 to 2.\n");
+
+	/* Pre-initialise new fields to zero */
+	offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
+	blocksize = be32_to_cpu(sb->s_blocksize);
+	memset(&sb->s_feature_compat, 0, blocksize-offset);
+
+	sb->s_nr_users = cpu_to_be32(1);
+	sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
+	journal->j_format_version = 2;
+
+	bh = journal->j_sb_buffer;
+	BUFFER_TRACE(bh, "marking dirty");
+	mark_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	return 0;
+}
+
+
+/**
+ * int jbd2_journal_flush () - Flush journal
+ * @journal: Journal to act on.
+ *
+ * Flush all data for a given journal to disk and empty the journal.
+ * Filesystems can use this when remounting readonly to ensure that
+ * recovery does not need to happen on remount.
+ */
+
+int jbd2_journal_flush(journal_t *journal)
+{
+	int err = 0;
+	transaction_t *transaction = NULL;
+	unsigned long old_tail;
+
+	spin_lock(&journal->j_state_lock);
+
+	/* Force everything buffered to the log... */
+	if (journal->j_running_transaction) {
+		transaction = journal->j_running_transaction;
+		__jbd2_log_start_commit(journal, transaction->t_tid);
+	} else if (journal->j_committing_transaction)
+		transaction = journal->j_committing_transaction;
+
+	/* Wait for the log commit to complete... */
+	if (transaction) {
+		tid_t tid = transaction->t_tid;
+
+		spin_unlock(&journal->j_state_lock);
+		jbd2_log_wait_commit(journal, tid);
+	} else {
+		spin_unlock(&journal->j_state_lock);
+	}
+
+	/* ...and flush everything in the log out to disk. */
+	spin_lock(&journal->j_list_lock);
+	while (!err && journal->j_checkpoint_transactions != NULL) {
+		spin_unlock(&journal->j_list_lock);
+		err = jbd2_log_do_checkpoint(journal);
+		spin_lock(&journal->j_list_lock);
+	}
+	spin_unlock(&journal->j_list_lock);
+	jbd2_cleanup_journal_tail(journal);
+
+	/* Finally, mark the journal as really needing no recovery.
+	 * This sets s_start==0 in the underlying superblock, which is
+	 * the magic code for a fully-recovered superblock.  Any future
+	 * commits of data to the journal will restore the current
+	 * s_start value. */
+	spin_lock(&journal->j_state_lock);
+	old_tail = journal->j_tail;
+	journal->j_tail = 0;
+	spin_unlock(&journal->j_state_lock);
+	jbd2_journal_update_superblock(journal, 1);
+	spin_lock(&journal->j_state_lock);
+	journal->j_tail = old_tail;
+
+	J_ASSERT(!journal->j_running_transaction);
+	J_ASSERT(!journal->j_committing_transaction);
+	J_ASSERT(!journal->j_checkpoint_transactions);
+	J_ASSERT(journal->j_head == journal->j_tail);
+	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
+	spin_unlock(&journal->j_state_lock);
+	return err;
+}
+
+/**
+ * int jbd2_journal_wipe() - Wipe journal contents
+ * @journal: Journal to act on.
+ * @write: flag (see below)
+ *
+ * Wipe out all of the contents of a journal, safely.  This will produce
+ * a warning if the journal contains any valid recovery information.
+ * Must be called between journal_init_*() and jbd2_journal_load().
+ *
+ * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
+ * we merely suppress recovery.
+ */
+
+int jbd2_journal_wipe(journal_t *journal, int write)
+{
+	journal_superblock_t *sb;
+	int err = 0;
+
+	J_ASSERT (!(journal->j_flags & JBD2_LOADED));
+
+	err = load_superblock(journal);
+	if (err)
+		return err;
+
+	sb = journal->j_superblock;
+
+	if (!journal->j_tail)
+		goto no_recovery;
+
+	printk (KERN_WARNING "JBD: %s recovery information on journal\n",
+		write ? "Clearing" : "Ignoring");
+
+	err = jbd2_journal_skip_recovery(journal);
+	if (write)
+		jbd2_journal_update_superblock(journal, 1);
+
+ no_recovery:
+	return err;
+}
+
+/*
+ * journal_dev_name: format a character string to describe on what
+ * device this journal is present.
+ */
+
+static const char *journal_dev_name(journal_t *journal, char *buffer)
+{
+	struct block_device *bdev;
+
+	if (journal->j_inode)
+		bdev = journal->j_inode->i_sb->s_bdev;
+	else
+		bdev = journal->j_dev;
+
+	return bdevname(bdev, buffer);
+}
+
+/*
+ * Journal abort has very specific semantics, which we describe
+ * for journal abort.
+ *
+ * Two internal function, which provide abort to te jbd layer
+ * itself are here.
+ */
+
+/*
+ * Quick version for internal journal use (doesn't lock the journal).
+ * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
+ * and don't attempt to make any other journal updates.
+ */
+void __jbd2_journal_abort_hard(journal_t *journal)
+{
+	transaction_t *transaction;
+	char b[BDEVNAME_SIZE];
+
+	if (journal->j_flags & JBD2_ABORT)
+		return;
+
+	printk(KERN_ERR "Aborting journal on device %s.\n",
+		journal_dev_name(journal, b));
+
+	spin_lock(&journal->j_state_lock);
+	journal->j_flags |= JBD2_ABORT;
+	transaction = journal->j_running_transaction;
+	if (transaction)
+		__jbd2_log_start_commit(journal, transaction->t_tid);
+	spin_unlock(&journal->j_state_lock);
+}
+
+/* Soft abort: record the abort error status in the journal superblock,
+ * but don't do any other IO. */
+static void __journal_abort_soft (journal_t *journal, int errno)
+{
+	if (journal->j_flags & JBD2_ABORT)
+		return;
+
+	if (!journal->j_errno)
+		journal->j_errno = errno;
+
+	__jbd2_journal_abort_hard(journal);
+
+	if (errno)
+		jbd2_journal_update_superblock(journal, 1);
+}
+
+/**
+ * void jbd2_journal_abort () - Shutdown the journal immediately.
+ * @journal: the journal to shutdown.
+ * @errno:   an error number to record in the journal indicating
+ *           the reason for the shutdown.
+ *
+ * Perform a complete, immediate shutdown of the ENTIRE
+ * journal (not of a single transaction).  This operation cannot be
+ * undone without closing and reopening the journal.
+ *
+ * The jbd2_journal_abort function is intended to support higher level error
+ * recovery mechanisms such as the ext2/ext3 remount-readonly error
+ * mode.
+ *
+ * Journal abort has very specific semantics.  Any existing dirty,
+ * unjournaled buffers in the main filesystem will still be written to
+ * disk by bdflush, but the journaling mechanism will be suspended
+ * immediately and no further transaction commits will be honoured.
+ *
+ * Any dirty, journaled buffers will be written back to disk without
+ * hitting the journal.  Atomicity cannot be guaranteed on an aborted
+ * filesystem, but we _do_ attempt to leave as much data as possible
+ * behind for fsck to use for cleanup.
+ *
+ * Any attempt to get a new transaction handle on a journal which is in
+ * ABORT state will just result in an -EROFS error return.  A
+ * jbd2_journal_stop on an existing handle will return -EIO if we have
+ * entered abort state during the update.
+ *
+ * Recursive transactions are not disturbed by journal abort until the
+ * final jbd2_journal_stop, which will receive the -EIO error.
+ *
+ * Finally, the jbd2_journal_abort call allows the caller to supply an errno
+ * which will be recorded (if possible) in the journal superblock.  This
+ * allows a client to record failure conditions in the middle of a
+ * transaction without having to complete the transaction to record the
+ * failure to disk.  ext3_error, for example, now uses this
+ * functionality.
+ *
+ * Errors which originate from within the journaling layer will NOT
+ * supply an errno; a null errno implies that absolutely no further
+ * writes are done to the journal (unless there are any already in
+ * progress).
+ *
+ */
+
+void jbd2_journal_abort(journal_t *journal, int errno)
+{
+	__journal_abort_soft(journal, errno);
+}
+
+/**
+ * int jbd2_journal_errno () - returns the journal's error state.
+ * @journal: journal to examine.
+ *
+ * This is the errno numbet set with jbd2_journal_abort(), the last
+ * time the journal was mounted - if the journal was stopped
+ * without calling abort this will be 0.
+ *
+ * If the journal has been aborted on this mount time -EROFS will
+ * be returned.
+ */
+int jbd2_journal_errno(journal_t *journal)
+{
+	int err;
+
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_flags & JBD2_ABORT)
+		err = -EROFS;
+	else
+		err = journal->j_errno;
+	spin_unlock(&journal->j_state_lock);
+	return err;
+}
+
+/**
+ * int jbd2_journal_clear_err () - clears the journal's error state
+ * @journal: journal to act on.
+ *
+ * An error must be cleared or Acked to take a FS out of readonly
+ * mode.
+ */
+int jbd2_journal_clear_err(journal_t *journal)
+{
+	int err = 0;
+
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_flags & JBD2_ABORT)
+		err = -EROFS;
+	else
+		journal->j_errno = 0;
+	spin_unlock(&journal->j_state_lock);
+	return err;
+}
+
+/**
+ * void jbd2_journal_ack_err() - Ack journal err.
+ * @journal: journal to act on.
+ *
+ * An error must be cleared or Acked to take a FS out of readonly
+ * mode.
+ */
+void jbd2_journal_ack_err(journal_t *journal)
+{
+	spin_lock(&journal->j_state_lock);
+	if (journal->j_errno)
+		journal->j_flags |= JBD2_ACK_ERR;
+	spin_unlock(&journal->j_state_lock);
+}
+
+int jbd2_journal_blocks_per_page(struct inode *inode)
+{
+	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+}
+
+/*
+ * helper functions to deal with 32 or 64bit block numbers.
+ */
+size_t journal_tag_bytes(journal_t *journal)
+{
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+		return JBD_TAG_SIZE64;
+	else
+		return JBD_TAG_SIZE32;
+}
+
+/*
+ * Simple support for retrying memory allocations.  Introduced to help to
+ * debug different VM deadlock avoidance strategies.
+ */
+void * __jbd2_kmalloc (const char *where, size_t size, gfp_t flags, int retry)
+{
+	return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0));
+}
+
+/*
+ * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed
+ * and allocate frozen and commit buffers from these slabs.
+ *
+ * Reason for doing this is to avoid, SLAB_DEBUG - since it could
+ * cause bh to cross page boundary.
+ */
+
+#define JBD_MAX_SLABS 5
+#define JBD_SLAB_INDEX(size)  (size >> 11)
+
+static kmem_cache_t *jbd_slab[JBD_MAX_SLABS];
+static const char *jbd_slab_names[JBD_MAX_SLABS] = {
+	"jbd2_1k", "jbd2_2k", "jbd2_4k", NULL, "jbd2_8k"
+};
+
+static void jbd2_journal_destroy_jbd_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < JBD_MAX_SLABS; i++) {
+		if (jbd_slab[i])
+			kmem_cache_destroy(jbd_slab[i]);
+		jbd_slab[i] = NULL;
+	}
+}
+
+static int jbd2_journal_create_jbd_slab(size_t slab_size)
+{
+	int i = JBD_SLAB_INDEX(slab_size);
+
+	BUG_ON(i >= JBD_MAX_SLABS);
+
+	/*
+	 * Check if we already have a slab created for this size
+	 */
+	if (jbd_slab[i])
+		return 0;
+
+	/*
+	 * Create a slab and force alignment to be same as slabsize -
+	 * this will make sure that allocations won't cross the page
+	 * boundary.
+	 */
+	jbd_slab[i] = kmem_cache_create(jbd_slab_names[i],
+				slab_size, slab_size, 0, NULL, NULL);
+	if (!jbd_slab[i]) {
+		printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void * jbd2_slab_alloc(size_t size, gfp_t flags)
+{
+	int idx;
+
+	idx = JBD_SLAB_INDEX(size);
+	BUG_ON(jbd_slab[idx] == NULL);
+	return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL);
+}
+
+void jbd2_slab_free(void *ptr,  size_t size)
+{
+	int idx;
+
+	idx = JBD_SLAB_INDEX(size);
+	BUG_ON(jbd_slab[idx] == NULL);
+	kmem_cache_free(jbd_slab[idx], ptr);
+}
+
+/*
+ * Journal_head storage management
+ */
+static kmem_cache_t *jbd2_journal_head_cache;
+#ifdef CONFIG_JBD_DEBUG
+static atomic_t nr_journal_heads = ATOMIC_INIT(0);
+#endif
+
+static int journal_init_jbd2_journal_head_cache(void)
+{
+	int retval;
+
+	J_ASSERT(jbd2_journal_head_cache == 0);
+	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
+				sizeof(struct journal_head),
+				0,		/* offset */
+				0,		/* flags */
+				NULL,		/* ctor */
+				NULL);		/* dtor */
+	retval = 0;
+	if (jbd2_journal_head_cache == 0) {
+		retval = -ENOMEM;
+		printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
+	}
+	return retval;
+}
+
+static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
+{
+	J_ASSERT(jbd2_journal_head_cache != NULL);
+	kmem_cache_destroy(jbd2_journal_head_cache);
+	jbd2_journal_head_cache = NULL;
+}
+
+/*
+ * journal_head splicing and dicing
+ */
+static struct journal_head *journal_alloc_journal_head(void)
+{
+	struct journal_head *ret;
+	static unsigned long last_warning;
+
+#ifdef CONFIG_JBD_DEBUG
+	atomic_inc(&nr_journal_heads);
+#endif
+	ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+	if (ret == 0) {
+		jbd_debug(1, "out of memory for journal_head\n");
+		if (time_after(jiffies, last_warning + 5*HZ)) {
+			printk(KERN_NOTICE "ENOMEM in %s, retrying.\n",
+			       __FUNCTION__);
+			last_warning = jiffies;
+		}
+		while (ret == 0) {
+			yield();
+			ret = kmem_cache_alloc(jbd2_journal_head_cache, GFP_NOFS);
+		}
+	}
+	return ret;
+}
+
+static void journal_free_journal_head(struct journal_head *jh)
+{
+#ifdef CONFIG_JBD_DEBUG
+	atomic_dec(&nr_journal_heads);
+	memset(jh, JBD_POISON_FREE, sizeof(*jh));
+#endif
+	kmem_cache_free(jbd2_journal_head_cache, jh);
+}
+
+/*
+ * A journal_head is attached to a buffer_head whenever JBD has an
+ * interest in the buffer.
+ *
+ * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
+ * is set.  This bit is tested in core kernel code where we need to take
+ * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
+ * there.
+ *
+ * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
+ *
+ * When a buffer has its BH_JBD bit set it is immune from being released by
+ * core kernel code, mainly via ->b_count.
+ *
+ * A journal_head may be detached from its buffer_head when the journal_head's
+ * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
+ * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the
+ * journal_head can be dropped if needed.
+ *
+ * Various places in the kernel want to attach a journal_head to a buffer_head
+ * _before_ attaching the journal_head to a transaction.  To protect the
+ * journal_head in this situation, jbd2_journal_add_journal_head elevates the
+ * journal_head's b_jcount refcount by one.  The caller must call
+ * jbd2_journal_put_journal_head() to undo this.
+ *
+ * So the typical usage would be:
+ *
+ *	(Attach a journal_head if needed.  Increments b_jcount)
+ *	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+ *	...
+ *	jh->b_transaction = xxx;
+ *	jbd2_journal_put_journal_head(jh);
+ *
+ * Now, the journal_head's b_jcount is zero, but it is safe from being released
+ * because it has a non-zero b_transaction.
+ */
+
+/*
+ * Give a buffer_head a journal_head.
+ *
+ * Doesn't need the journal lock.
+ * May sleep.
+ */
+struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
+{
+	struct journal_head *jh;
+	struct journal_head *new_jh = NULL;
+
+repeat:
+	if (!buffer_jbd(bh)) {
+		new_jh = journal_alloc_journal_head();
+		memset(new_jh, 0, sizeof(*new_jh));
+	}
+
+	jbd_lock_bh_journal_head(bh);
+	if (buffer_jbd(bh)) {
+		jh = bh2jh(bh);
+	} else {
+		J_ASSERT_BH(bh,
+			(atomic_read(&bh->b_count) > 0) ||
+			(bh->b_page && bh->b_page->mapping));
+
+		if (!new_jh) {
+			jbd_unlock_bh_journal_head(bh);
+			goto repeat;
+		}
+
+		jh = new_jh;
+		new_jh = NULL;		/* We consumed it */
+		set_buffer_jbd(bh);
+		bh->b_private = jh;
+		jh->b_bh = bh;
+		get_bh(bh);
+		BUFFER_TRACE(bh, "added journal_head");
+	}
+	jh->b_jcount++;
+	jbd_unlock_bh_journal_head(bh);
+	if (new_jh)
+		journal_free_journal_head(new_jh);
+	return bh->b_private;
+}
+
+/*
+ * Grab a ref against this buffer_head's journal_head.  If it ended up not
+ * having a journal_head, return NULL
+ */
+struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
+{
+	struct journal_head *jh = NULL;
+
+	jbd_lock_bh_journal_head(bh);
+	if (buffer_jbd(bh)) {
+		jh = bh2jh(bh);
+		jh->b_jcount++;
+	}
+	jbd_unlock_bh_journal_head(bh);
+	return jh;
+}
+
+static void __journal_remove_journal_head(struct buffer_head *bh)
+{
+	struct journal_head *jh = bh2jh(bh);
+
+	J_ASSERT_JH(jh, jh->b_jcount >= 0);
+
+	get_bh(bh);
+	if (jh->b_jcount == 0) {
+		if (jh->b_transaction == NULL &&
+				jh->b_next_transaction == NULL &&
+				jh->b_cp_transaction == NULL) {
+			J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
+			J_ASSERT_BH(bh, buffer_jbd(bh));
+			J_ASSERT_BH(bh, jh2bh(jh) == bh);
+			BUFFER_TRACE(bh, "remove journal_head");
+			if (jh->b_frozen_data) {
+				printk(KERN_WARNING "%s: freeing "
+						"b_frozen_data\n",
+						__FUNCTION__);
+				jbd2_slab_free(jh->b_frozen_data, bh->b_size);
+			}
+			if (jh->b_committed_data) {
+				printk(KERN_WARNING "%s: freeing "
+						"b_committed_data\n",
+						__FUNCTION__);
+				jbd2_slab_free(jh->b_committed_data, bh->b_size);
+			}
+			bh->b_private = NULL;
+			jh->b_bh = NULL;	/* debug, really */
+			clear_buffer_jbd(bh);
+			__brelse(bh);
+			journal_free_journal_head(jh);
+		} else {
+			BUFFER_TRACE(bh, "journal_head was locked");
+		}
+	}
+}
+
+/*
+ * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction
+ * and has a zero b_jcount then remove and release its journal_head.   If we did
+ * see that the buffer is not used by any transaction we also "logically"
+ * decrement ->b_count.
+ *
+ * We in fact take an additional increment on ->b_count as a convenience,
+ * because the caller usually wants to do additional things with the bh
+ * after calling here.
+ * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some
+ * time.  Once the caller has run __brelse(), the buffer is eligible for
+ * reaping by try_to_free_buffers().
+ */
+void jbd2_journal_remove_journal_head(struct buffer_head *bh)
+{
+	jbd_lock_bh_journal_head(bh);
+	__journal_remove_journal_head(bh);
+	jbd_unlock_bh_journal_head(bh);
+}
+
+/*
+ * Drop a reference on the passed journal_head.  If it fell to zero then try to
+ * release the journal_head from the buffer_head.
+ */
+void jbd2_journal_put_journal_head(struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	jbd_lock_bh_journal_head(bh);
+	J_ASSERT_JH(jh, jh->b_jcount > 0);
+	--jh->b_jcount;
+	if (!jh->b_jcount && !jh->b_transaction) {
+		__journal_remove_journal_head(bh);
+		__brelse(bh);
+	}
+	jbd_unlock_bh_journal_head(bh);
+}
+
+/*
+ * /proc tunables
+ */
+#if defined(CONFIG_JBD_DEBUG)
+int jbd2_journal_enable_debug;
+EXPORT_SYMBOL(jbd2_journal_enable_debug);
+#endif
+
+#if defined(CONFIG_JBD_DEBUG) && defined(CONFIG_PROC_FS)
+
+static struct proc_dir_entry *proc_jbd_debug;
+
+static int read_jbd_debug(char *page, char **start, off_t off,
+			  int count, int *eof, void *data)
+{
+	int ret;
+
+	ret = sprintf(page + off, "%d\n", jbd2_journal_enable_debug);
+	*eof = 1;
+	return ret;
+}
+
+static int write_jbd_debug(struct file *file, const char __user *buffer,
+			   unsigned long count, void *data)
+{
+	char buf[32];
+
+	if (count > ARRAY_SIZE(buf) - 1)
+		count = ARRAY_SIZE(buf) - 1;
+	if (copy_from_user(buf, buffer, count))
+		return -EFAULT;
+	buf[ARRAY_SIZE(buf) - 1] = '\0';
+	jbd2_journal_enable_debug = simple_strtoul(buf, NULL, 10);
+	return count;
+}
+
+#define JBD_PROC_NAME "sys/fs/jbd2-debug"
+
+static void __init create_jbd_proc_entry(void)
+{
+	proc_jbd_debug = create_proc_entry(JBD_PROC_NAME, 0644, NULL);
+	if (proc_jbd_debug) {
+		/* Why is this so hard? */
+		proc_jbd_debug->read_proc = read_jbd_debug;
+		proc_jbd_debug->write_proc = write_jbd_debug;
+	}
+}
+
+static void __exit jbd2_remove_jbd_proc_entry(void)
+{
+	if (proc_jbd_debug)
+		remove_proc_entry(JBD_PROC_NAME, NULL);
+}
+
+#else
+
+#define create_jbd_proc_entry() do {} while (0)
+#define jbd2_remove_jbd_proc_entry() do {} while (0)
+
+#endif
+
+kmem_cache_t *jbd2_handle_cache;
+
+static int __init journal_init_handle_cache(void)
+{
+	jbd2_handle_cache = kmem_cache_create("jbd2_journal_handle",
+				sizeof(handle_t),
+				0,		/* offset */
+				0,		/* flags */
+				NULL,		/* ctor */
+				NULL);		/* dtor */
+	if (jbd2_handle_cache == NULL) {
+		printk(KERN_EMERG "JBD: failed to create handle cache\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void jbd2_journal_destroy_handle_cache(void)
+{
+	if (jbd2_handle_cache)
+		kmem_cache_destroy(jbd2_handle_cache);
+}
+
+/*
+ * Module startup and shutdown
+ */
+
+static int __init journal_init_caches(void)
+{
+	int ret;
+
+	ret = jbd2_journal_init_revoke_caches();
+	if (ret == 0)
+		ret = journal_init_jbd2_journal_head_cache();
+	if (ret == 0)
+		ret = journal_init_handle_cache();
+	return ret;
+}
+
+static void jbd2_journal_destroy_caches(void)
+{
+	jbd2_journal_destroy_revoke_caches();
+	jbd2_journal_destroy_jbd2_journal_head_cache();
+	jbd2_journal_destroy_handle_cache();
+	jbd2_journal_destroy_jbd_slabs();
+}
+
+static int __init journal_init(void)
+{
+	int ret;
+
+	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
+
+	ret = journal_init_caches();
+	if (ret != 0)
+		jbd2_journal_destroy_caches();
+	create_jbd_proc_entry();
+	return ret;
+}
+
+static void __exit journal_exit(void)
+{
+#ifdef CONFIG_JBD_DEBUG
+	int n = atomic_read(&nr_journal_heads);
+	if (n)
+		printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
+#endif
+	jbd2_remove_jbd_proc_entry();
+	jbd2_journal_destroy_caches();
+}
+
+MODULE_LICENSE("GPL");
+module_init(journal_init);
+module_exit(journal_exit);
+
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
new file mode 100644
index 000000000000..9f10acafaf70
--- /dev/null
+++ b/fs/jbd2/recovery.c
@@ -0,0 +1,609 @@
+/*
+ * linux/fs/recovery.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+ *
+ * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal recovery routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ */
+
+#ifndef __KERNEL__
+#include "jfs_user.h"
+#else
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#endif
+
+/*
+ * Maintain information about the progress of the recovery job, so that
+ * the different passes can carry information between them.
+ */
+struct recovery_info
+{
+	tid_t		start_transaction;
+	tid_t		end_transaction;
+
+	int		nr_replays;
+	int		nr_revokes;
+	int		nr_revoke_hits;
+};
+
+enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
+static int do_one_pass(journal_t *journal,
+				struct recovery_info *info, enum passtype pass);
+static int scan_revoke_records(journal_t *, struct buffer_head *,
+				tid_t, struct recovery_info *);
+
+#ifdef __KERNEL__
+
+/* Release readahead buffers after use */
+static void journal_brelse_array(struct buffer_head *b[], int n)
+{
+	while (--n >= 0)
+		brelse (b[n]);
+}
+
+
+/*
+ * When reading from the journal, we are going through the block device
+ * layer directly and so there is no readahead being done for us.  We
+ * need to implement any readahead ourselves if we want it to happen at
+ * all.  Recovery is basically one long sequential read, so make sure we
+ * do the IO in reasonably large chunks.
+ *
+ * This is not so critical that we need to be enormously clever about
+ * the readahead size, though.  128K is a purely arbitrary, good-enough
+ * fixed value.
+ */
+
+#define MAXBUF 8
+static int do_readahead(journal_t *journal, unsigned int start)
+{
+	int err;
+	unsigned int max, nbufs, next;
+	unsigned long long blocknr;
+	struct buffer_head *bh;
+
+	struct buffer_head * bufs[MAXBUF];
+
+	/* Do up to 128K of readahead */
+	max = start + (128 * 1024 / journal->j_blocksize);
+	if (max > journal->j_maxlen)
+		max = journal->j_maxlen;
+
+	/* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
+	 * a time to the block device IO layer. */
+
+	nbufs = 0;
+
+	for (next = start; next < max; next++) {
+		err = jbd2_journal_bmap(journal, next, &blocknr);
+
+		if (err) {
+			printk (KERN_ERR "JBD: bad block at offset %u\n",
+				next);
+			goto failed;
+		}
+
+		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+		if (!bh) {
+			err = -ENOMEM;
+			goto failed;
+		}
+
+		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
+			bufs[nbufs++] = bh;
+			if (nbufs == MAXBUF) {
+				ll_rw_block(READ, nbufs, bufs);
+				journal_brelse_array(bufs, nbufs);
+				nbufs = 0;
+			}
+		} else
+			brelse(bh);
+	}
+
+	if (nbufs)
+		ll_rw_block(READ, nbufs, bufs);
+	err = 0;
+
+failed:
+	if (nbufs)
+		journal_brelse_array(bufs, nbufs);
+	return err;
+}
+
+#endif /* __KERNEL__ */
+
+
+/*
+ * Read a block from the journal
+ */
+
+static int jread(struct buffer_head **bhp, journal_t *journal,
+		 unsigned int offset)
+{
+	int err;
+	unsigned long long blocknr;
+	struct buffer_head *bh;
+
+	*bhp = NULL;
+
+	if (offset >= journal->j_maxlen) {
+		printk(KERN_ERR "JBD: corrupted journal superblock\n");
+		return -EIO;
+	}
+
+	err = jbd2_journal_bmap(journal, offset, &blocknr);
+
+	if (err) {
+		printk (KERN_ERR "JBD: bad block at offset %u\n",
+			offset);
+		return err;
+	}
+
+	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return -ENOMEM;
+
+	if (!buffer_uptodate(bh)) {
+		/* If this is a brand new buffer, start readahead.
+                   Otherwise, we assume we are already reading it.  */
+		if (!buffer_req(bh))
+			do_readahead(journal, offset);
+		wait_on_buffer(bh);
+	}
+
+	if (!buffer_uptodate(bh)) {
+		printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+			offset);
+		brelse(bh);
+		return -EIO;
+	}
+
+	*bhp = bh;
+	return 0;
+}
+
+
+/*
+ * Count the number of in-use tags in a journal descriptor block.
+ */
+
+static int count_tags(journal_t *journal, struct buffer_head *bh)
+{
+	char *			tagp;
+	journal_block_tag_t *	tag;
+	int			nr = 0, size = journal->j_blocksize;
+	int			tag_bytes = journal_tag_bytes(journal);
+
+	tagp = &bh->b_data[sizeof(journal_header_t)];
+
+	while ((tagp - bh->b_data + tag_bytes) <= size) {
+		tag = (journal_block_tag_t *) tagp;
+
+		nr++;
+		tagp += tag_bytes;
+		if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
+			tagp += 16;
+
+		if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
+			break;
+	}
+
+	return nr;
+}
+
+
+/* Make sure we wrap around the log correctly! */
+#define wrap(journal, var)						\
+do {									\
+	if (var >= (journal)->j_last)					\
+		var -= ((journal)->j_last - (journal)->j_first);	\
+} while (0)
+
+/**
+ * jbd2_journal_recover - recovers a on-disk journal
+ * @journal: the journal to recover
+ *
+ * The primary function for recovering the log contents when mounting a
+ * journaled device.
+ *
+ * Recovery is done in three passes.  In the first pass, we look for the
+ * end of the log.  In the second, we assemble the list of revoke
+ * blocks.  In the third and final pass, we replay any un-revoked blocks
+ * in the log.
+ */
+int jbd2_journal_recover(journal_t *journal)
+{
+	int			err;
+	journal_superblock_t *	sb;
+
+	struct recovery_info	info;
+
+	memset(&info, 0, sizeof(info));
+	sb = journal->j_superblock;
+
+	/*
+	 * The journal superblock's s_start field (the current log head)
+	 * is always zero if, and only if, the journal was cleanly
+	 * unmounted.
+	 */
+
+	if (!sb->s_start) {
+		jbd_debug(1, "No recovery required, last transaction %d\n",
+			  be32_to_cpu(sb->s_sequence));
+		journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
+		return 0;
+	}
+
+	err = do_one_pass(journal, &info, PASS_SCAN);
+	if (!err)
+		err = do_one_pass(journal, &info, PASS_REVOKE);
+	if (!err)
+		err = do_one_pass(journal, &info, PASS_REPLAY);
+
+	jbd_debug(0, "JBD: recovery, exit status %d, "
+		  "recovered transactions %u to %u\n",
+		  err, info.start_transaction, info.end_transaction);
+	jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
+		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+
+	/* Restart the log at the next transaction ID, thus invalidating
+	 * any existing commit records in the log. */
+	journal->j_transaction_sequence = ++info.end_transaction;
+
+	jbd2_journal_clear_revoke(journal);
+	sync_blockdev(journal->j_fs_dev);
+	return err;
+}
+
+/**
+ * jbd2_journal_skip_recovery - Start journal and wipe exiting records
+ * @journal: journal to startup
+ *
+ * Locate any valid recovery information from the journal and set up the
+ * journal structures in memory to ignore it (presumably because the
+ * caller has evidence that it is out of date).
+ * This function does'nt appear to be exorted..
+ *
+ * We perform one pass over the journal to allow us to tell the user how
+ * much recovery information is being erased, and to let us initialise
+ * the journal transaction sequence numbers to the next unused ID.
+ */
+int jbd2_journal_skip_recovery(journal_t *journal)
+{
+	int			err;
+	journal_superblock_t *	sb;
+
+	struct recovery_info	info;
+
+	memset (&info, 0, sizeof(info));
+	sb = journal->j_superblock;
+
+	err = do_one_pass(journal, &info, PASS_SCAN);
+
+	if (err) {
+		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+		++journal->j_transaction_sequence;
+	} else {
+#ifdef CONFIG_JBD_DEBUG
+		int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
+#endif
+		jbd_debug(0,
+			  "JBD: ignoring %d transaction%s from the journal.\n",
+			  dropped, (dropped == 1) ? "" : "s");
+		journal->j_transaction_sequence = ++info.end_transaction;
+	}
+
+	journal->j_tail = 0;
+	return err;
+}
+
+static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
+{
+	unsigned long long block = be32_to_cpu(tag->t_blocknr);
+	if (tag_bytes > JBD_TAG_SIZE32)
+		block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
+	return block;
+}
+
+static int do_one_pass(journal_t *journal,
+			struct recovery_info *info, enum passtype pass)
+{
+	unsigned int		first_commit_ID, next_commit_ID;
+	unsigned long		next_log_block;
+	int			err, success = 0;
+	journal_superblock_t *	sb;
+	journal_header_t *	tmp;
+	struct buffer_head *	bh;
+	unsigned int		sequence;
+	int			blocktype;
+	int			tag_bytes = journal_tag_bytes(journal);
+
+	/* Precompute the maximum metadata descriptors in a descriptor block */
+	int			MAX_BLOCKS_PER_DESC;
+	MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
+			       / tag_bytes);
+
+	/*
+	 * First thing is to establish what we expect to find in the log
+	 * (in terms of transaction IDs), and where (in terms of log
+	 * block offsets): query the superblock.
+	 */
+
+	sb = journal->j_superblock;
+	next_commit_ID = be32_to_cpu(sb->s_sequence);
+	next_log_block = be32_to_cpu(sb->s_start);
+
+	first_commit_ID = next_commit_ID;
+	if (pass == PASS_SCAN)
+		info->start_transaction = first_commit_ID;
+
+	jbd_debug(1, "Starting recovery pass %d\n", pass);
+
+	/*
+	 * Now we walk through the log, transaction by transaction,
+	 * making sure that each transaction has a commit block in the
+	 * expected place.  Each complete transaction gets replayed back
+	 * into the main filesystem.
+	 */
+
+	while (1) {
+		int			flags;
+		char *			tagp;
+		journal_block_tag_t *	tag;
+		struct buffer_head *	obh;
+		struct buffer_head *	nbh;
+
+		cond_resched();		/* We're under lock_kernel() */
+
+		/* If we already know where to stop the log traversal,
+		 * check right now that we haven't gone past the end of
+		 * the log. */
+
+		if (pass != PASS_SCAN)
+			if (tid_geq(next_commit_ID, info->end_transaction))
+				break;
+
+		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+			  next_commit_ID, next_log_block, journal->j_last);
+
+		/* Skip over each chunk of the transaction looking
+		 * either the next descriptor block or the final commit
+		 * record. */
+
+		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+		err = jread(&bh, journal, next_log_block);
+		if (err)
+			goto failed;
+
+		next_log_block++;
+		wrap(journal, next_log_block);
+
+		/* What kind of buffer is it?
+		 *
+		 * If it is a descriptor block, check that it has the
+		 * expected sequence number.  Otherwise, we're all done
+		 * here. */
+
+		tmp = (journal_header_t *)bh->b_data;
+
+		if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+			brelse(bh);
+			break;
+		}
+
+		blocktype = be32_to_cpu(tmp->h_blocktype);
+		sequence = be32_to_cpu(tmp->h_sequence);
+		jbd_debug(3, "Found magic %d, sequence %d\n",
+			  blocktype, sequence);
+
+		if (sequence != next_commit_ID) {
+			brelse(bh);
+			break;
+		}
+
+		/* OK, we have a valid descriptor block which matches
+		 * all of the sequence number checks.  What are we going
+		 * to do with it?  That depends on the pass... */
+
+		switch(blocktype) {
+		case JBD2_DESCRIPTOR_BLOCK:
+			/* If it is a valid descriptor block, replay it
+			 * in pass REPLAY; otherwise, just skip over the
+			 * blocks it describes. */
+			if (pass != PASS_REPLAY) {
+				next_log_block += count_tags(journal, bh);
+				wrap(journal, next_log_block);
+				brelse(bh);
+				continue;
+			}
+
+			/* A descriptor block: we can now write all of
+			 * the data blocks.  Yay, useful work is finally
+			 * getting done here! */
+
+			tagp = &bh->b_data[sizeof(journal_header_t)];
+			while ((tagp - bh->b_data + tag_bytes)
+			       <= journal->j_blocksize) {
+				unsigned long io_block;
+
+				tag = (journal_block_tag_t *) tagp;
+				flags = be32_to_cpu(tag->t_flags);
+
+				io_block = next_log_block++;
+				wrap(journal, next_log_block);
+				err = jread(&obh, journal, io_block);
+				if (err) {
+					/* Recover what we can, but
+					 * report failure at the end. */
+					success = err;
+					printk (KERN_ERR
+						"JBD: IO error %d recovering "
+						"block %ld in log\n",
+						err, io_block);
+				} else {
+					unsigned long long blocknr;
+
+					J_ASSERT(obh != NULL);
+					blocknr = read_tag_block(tag_bytes,
+								 tag);
+
+					/* If the block has been
+					 * revoked, then we're all done
+					 * here. */
+					if (jbd2_journal_test_revoke
+					    (journal, blocknr,
+					     next_commit_ID)) {
+						brelse(obh);
+						++info->nr_revoke_hits;
+						goto skip_write;
+					}
+
+					/* Find a buffer for the new
+					 * data being restored */
+					nbh = __getblk(journal->j_fs_dev,
+							blocknr,
+							journal->j_blocksize);
+					if (nbh == NULL) {
+						printk(KERN_ERR
+						       "JBD: Out of memory "
+						       "during recovery.\n");
+						err = -ENOMEM;
+						brelse(bh);
+						brelse(obh);
+						goto failed;
+					}
+
+					lock_buffer(nbh);
+					memcpy(nbh->b_data, obh->b_data,
+							journal->j_blocksize);
+					if (flags & JBD2_FLAG_ESCAPE) {
+						*((__be32 *)bh->b_data) =
+						cpu_to_be32(JBD2_MAGIC_NUMBER);
+					}
+
+					BUFFER_TRACE(nbh, "marking dirty");
+					set_buffer_uptodate(nbh);
+					mark_buffer_dirty(nbh);
+					BUFFER_TRACE(nbh, "marking uptodate");
+					++info->nr_replays;
+					/* ll_rw_block(WRITE, 1, &nbh); */
+					unlock_buffer(nbh);
+					brelse(obh);
+					brelse(nbh);
+				}
+
+			skip_write:
+				tagp += tag_bytes;
+				if (!(flags & JBD2_FLAG_SAME_UUID))
+					tagp += 16;
+
+				if (flags & JBD2_FLAG_LAST_TAG)
+					break;
+			}
+
+			brelse(bh);
+			continue;
+
+		case JBD2_COMMIT_BLOCK:
+			/* Found an expected commit block: not much to
+			 * do other than move on to the next sequence
+			 * number. */
+			brelse(bh);
+			next_commit_ID++;
+			continue;
+
+		case JBD2_REVOKE_BLOCK:
+			/* If we aren't in the REVOKE pass, then we can
+			 * just skip over this block. */
+			if (pass != PASS_REVOKE) {
+				brelse(bh);
+				continue;
+			}
+
+			err = scan_revoke_records(journal, bh,
+						  next_commit_ID, info);
+			brelse(bh);
+			if (err)
+				goto failed;
+			continue;
+
+		default:
+			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+				  blocktype);
+			brelse(bh);
+			goto done;
+		}
+	}
+
+ done:
+	/*
+	 * We broke out of the log scan loop: either we came to the
+	 * known end of the log or we found an unexpected block in the
+	 * log.  If the latter happened, then we know that the "current"
+	 * transaction marks the end of the valid log.
+	 */
+
+	if (pass == PASS_SCAN)
+		info->end_transaction = next_commit_ID;
+	else {
+		/* It's really bad news if different passes end up at
+		 * different places (but possible due to IO errors). */
+		if (info->end_transaction != next_commit_ID) {
+			printk (KERN_ERR "JBD: recovery pass %d ended at "
+				"transaction %u, expected %u\n",
+				pass, next_commit_ID, info->end_transaction);
+			if (!success)
+				success = -EIO;
+		}
+	}
+
+	return success;
+
+ failed:
+	return err;
+}
+
+
+/* Scan a revoke record, marking all blocks mentioned as revoked. */
+
+static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
+			       tid_t sequence, struct recovery_info *info)
+{
+	jbd2_journal_revoke_header_t *header;
+	int offset, max;
+	int record_len = 4;
+
+	header = (jbd2_journal_revoke_header_t *) bh->b_data;
+	offset = sizeof(jbd2_journal_revoke_header_t);
+	max = be32_to_cpu(header->r_count);
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+		record_len = 8;
+
+	while (offset + record_len <= max) {
+		unsigned long long blocknr;
+		int err;
+
+		if (record_len == 4)
+			blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
+		else
+			blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
+		offset += record_len;
+		err = jbd2_journal_set_revoke(journal, blocknr, sequence);
+		if (err)
+			return err;
+		++info->nr_revokes;
+	}
+	return 0;
+}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
new file mode 100644
index 000000000000..380d19917f37
--- /dev/null
+++ b/fs/jbd2/revoke.c
@@ -0,0 +1,712 @@
+/*
+ * linux/fs/revoke.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
+ *
+ * Copyright 2000 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Journal revoke routines for the generic filesystem journaling code;
+ * part of the ext2fs journaling system.
+ *
+ * Revoke is the mechanism used to prevent old log records for deleted
+ * metadata from being replayed on top of newer data using the same
+ * blocks.  The revoke mechanism is used in two separate places:
+ *
+ * + Commit: during commit we write the entire list of the current
+ *   transaction's revoked blocks to the journal
+ *
+ * + Recovery: during recovery we record the transaction ID of all
+ *   revoked blocks.  If there are multiple revoke records in the log
+ *   for a single block, only the last one counts, and if there is a log
+ *   entry for a block beyond the last revoke, then that log entry still
+ *   gets replayed.
+ *
+ * We can get interactions between revokes and new log data within a
+ * single transaction:
+ *
+ * Block is revoked and then journaled:
+ *   The desired end result is the journaling of the new block, so we
+ *   cancel the revoke before the transaction commits.
+ *
+ * Block is journaled and then revoked:
+ *   The revoke must take precedence over the write of the block, so we
+ *   need either to cancel the journal entry or to write the revoke
+ *   later in the log than the log block.  In this case, we choose the
+ *   latter: journaling a block cancels any revoke record for that block
+ *   in the current transaction, so any revoke for that block in the
+ *   transaction must have happened after the block was journaled and so
+ *   the revoke must take precedence.
+ *
+ * Block is revoked and then written as data:
+ *   The data write is allowed to succeed, but the revoke is _not_
+ *   cancelled.  We still need to prevent old log records from
+ *   overwriting the new data.  We don't even need to clear the revoke
+ *   bit here.
+ *
+ * Revoke information on buffers is a tri-state value:
+ *
+ * RevokeValid clear:	no cached revoke status, need to look it up
+ * RevokeValid set, Revoked clear:
+ *			buffer has not been revoked, and cancel_revoke
+ *			need do nothing.
+ * RevokeValid set, Revoked set:
+ *			buffer has been revoked.
+ */
+
+#ifndef __KERNEL__
+#include "jfs_user.h"
+#else
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#endif
+
+static kmem_cache_t *jbd2_revoke_record_cache;
+static kmem_cache_t *jbd2_revoke_table_cache;
+
+/* Each revoke record represents one single revoked block.  During
+   journal replay, this involves recording the transaction ID of the
+   last transaction to revoke this block. */
+
+struct jbd2_revoke_record_s
+{
+	struct list_head  hash;
+	tid_t		  sequence;	/* Used for recovery only */
+	unsigned long long	  blocknr;
+};
+
+
+/* The revoke table is just a simple hash table of revoke records. */
+struct jbd2_revoke_table_s
+{
+	/* It is conceivable that we might want a larger hash table
+	 * for recovery.  Must be a power of two. */
+	int		  hash_size;
+	int		  hash_shift;
+	struct list_head *hash_table;
+};
+
+
+#ifdef __KERNEL__
+static void write_one_revoke_record(journal_t *, transaction_t *,
+				    struct journal_head **, int *,
+				    struct jbd2_revoke_record_s *);
+static void flush_descriptor(journal_t *, struct journal_head *, int);
+#endif
+
+/* Utility functions to maintain the revoke table */
+
+/* Borrowed from buffer.c: this is a tried and tested block hash function */
+static inline int hash(journal_t *journal, unsigned long long block)
+{
+	struct jbd2_revoke_table_s *table = journal->j_revoke;
+	int hash_shift = table->hash_shift;
+	int hash = (int)block ^ (int)((block >> 31) >> 1);
+
+	return ((hash << (hash_shift - 6)) ^
+		(hash >> 13) ^
+		(hash << (hash_shift - 12))) & (table->hash_size - 1);
+}
+
+static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
+			      tid_t seq)
+{
+	struct list_head *hash_list;
+	struct jbd2_revoke_record_s *record;
+
+repeat:
+	record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS);
+	if (!record)
+		goto oom;
+
+	record->sequence = seq;
+	record->blocknr = blocknr;
+	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+	spin_lock(&journal->j_revoke_lock);
+	list_add(&record->hash, hash_list);
+	spin_unlock(&journal->j_revoke_lock);
+	return 0;
+
+oom:
+	if (!journal_oom_retry)
+		return -ENOMEM;
+	jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__);
+	yield();
+	goto repeat;
+}
+
+/* Find a revoke record in the journal's hash table. */
+
+static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
+						      unsigned long long blocknr)
+{
+	struct list_head *hash_list;
+	struct jbd2_revoke_record_s *record;
+
+	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+
+	spin_lock(&journal->j_revoke_lock);
+	record = (struct jbd2_revoke_record_s *) hash_list->next;
+	while (&(record->hash) != hash_list) {
+		if (record->blocknr == blocknr) {
+			spin_unlock(&journal->j_revoke_lock);
+			return record;
+		}
+		record = (struct jbd2_revoke_record_s *) record->hash.next;
+	}
+	spin_unlock(&journal->j_revoke_lock);
+	return NULL;
+}
+
+int __init jbd2_journal_init_revoke_caches(void)
+{
+	jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
+					   sizeof(struct jbd2_revoke_record_s),
+					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (jbd2_revoke_record_cache == 0)
+		return -ENOMEM;
+
+	jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
+					   sizeof(struct jbd2_revoke_table_s),
+					   0, 0, NULL, NULL);
+	if (jbd2_revoke_table_cache == 0) {
+		kmem_cache_destroy(jbd2_revoke_record_cache);
+		jbd2_revoke_record_cache = NULL;
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void jbd2_journal_destroy_revoke_caches(void)
+{
+	kmem_cache_destroy(jbd2_revoke_record_cache);
+	jbd2_revoke_record_cache = NULL;
+	kmem_cache_destroy(jbd2_revoke_table_cache);
+	jbd2_revoke_table_cache = NULL;
+}
+
+/* Initialise the revoke table for a given journal to a given size. */
+
+int jbd2_journal_init_revoke(journal_t *journal, int hash_size)
+{
+	int shift, tmp;
+
+	J_ASSERT (journal->j_revoke_table[0] == NULL);
+
+	shift = 0;
+	tmp = hash_size;
+	while((tmp >>= 1UL) != 0UL)
+		shift++;
+
+	journal->j_revoke_table[0] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
+	if (!journal->j_revoke_table[0])
+		return -ENOMEM;
+	journal->j_revoke = journal->j_revoke_table[0];
+
+	/* Check that the hash_size is a power of two */
+	J_ASSERT ((hash_size & (hash_size-1)) == 0);
+
+	journal->j_revoke->hash_size = hash_size;
+
+	journal->j_revoke->hash_shift = shift;
+
+	journal->j_revoke->hash_table =
+		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+	if (!journal->j_revoke->hash_table) {
+		kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
+		journal->j_revoke = NULL;
+		return -ENOMEM;
+	}
+
+	for (tmp = 0; tmp < hash_size; tmp++)
+		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+
+	journal->j_revoke_table[1] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL);
+	if (!journal->j_revoke_table[1]) {
+		kfree(journal->j_revoke_table[0]->hash_table);
+		kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
+		return -ENOMEM;
+	}
+
+	journal->j_revoke = journal->j_revoke_table[1];
+
+	/* Check that the hash_size is a power of two */
+	J_ASSERT ((hash_size & (hash_size-1)) == 0);
+
+	journal->j_revoke->hash_size = hash_size;
+
+	journal->j_revoke->hash_shift = shift;
+
+	journal->j_revoke->hash_table =
+		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+	if (!journal->j_revoke->hash_table) {
+		kfree(journal->j_revoke_table[0]->hash_table);
+		kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]);
+		kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[1]);
+		journal->j_revoke = NULL;
+		return -ENOMEM;
+	}
+
+	for (tmp = 0; tmp < hash_size; tmp++)
+		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+
+	spin_lock_init(&journal->j_revoke_lock);
+
+	return 0;
+}
+
+/* Destoy a journal's revoke table.  The table must already be empty! */
+
+void jbd2_journal_destroy_revoke(journal_t *journal)
+{
+	struct jbd2_revoke_table_s *table;
+	struct list_head *hash_list;
+	int i;
+
+	table = journal->j_revoke_table[0];
+	if (!table)
+		return;
+
+	for (i=0; i<table->hash_size; i++) {
+		hash_list = &table->hash_table[i];
+		J_ASSERT (list_empty(hash_list));
+	}
+
+	kfree(table->hash_table);
+	kmem_cache_free(jbd2_revoke_table_cache, table);
+	journal->j_revoke = NULL;
+
+	table = journal->j_revoke_table[1];
+	if (!table)
+		return;
+
+	for (i=0; i<table->hash_size; i++) {
+		hash_list = &table->hash_table[i];
+		J_ASSERT (list_empty(hash_list));
+	}
+
+	kfree(table->hash_table);
+	kmem_cache_free(jbd2_revoke_table_cache, table);
+	journal->j_revoke = NULL;
+}
+
+
+#ifdef __KERNEL__
+
+/*
+ * jbd2_journal_revoke: revoke a given buffer_head from the journal.  This
+ * prevents the block from being replayed during recovery if we take a
+ * crash after this current transaction commits.  Any subsequent
+ * metadata writes of the buffer in this transaction cancel the
+ * revoke.
+ *
+ * Note that this call may block --- it is up to the caller to make
+ * sure that there are no further calls to journal_write_metadata
+ * before the revoke is complete.  In ext3, this implies calling the
+ * revoke before clearing the block bitmap when we are deleting
+ * metadata.
+ *
+ * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a
+ * parameter, but does _not_ forget the buffer_head if the bh was only
+ * found implicitly.
+ *
+ * bh_in may not be a journalled buffer - it may have come off
+ * the hash tables without an attached journal_head.
+ *
+ * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count
+ * by one.
+ */
+
+int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
+		   struct buffer_head *bh_in)
+{
+	struct buffer_head *bh = NULL;
+	journal_t *journal;
+	struct block_device *bdev;
+	int err;
+
+	might_sleep();
+	if (bh_in)
+		BUFFER_TRACE(bh_in, "enter");
+
+	journal = handle->h_transaction->t_journal;
+	if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){
+		J_ASSERT (!"Cannot set revoke feature!");
+		return -EINVAL;
+	}
+
+	bdev = journal->j_fs_dev;
+	bh = bh_in;
+
+	if (!bh) {
+		bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+		if (bh)
+			BUFFER_TRACE(bh, "found on hash");
+	}
+#ifdef JBD_EXPENSIVE_CHECKING
+	else {
+		struct buffer_head *bh2;
+
+		/* If there is a different buffer_head lying around in
+		 * memory anywhere... */
+		bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+		if (bh2) {
+			/* ... and it has RevokeValid status... */
+			if (bh2 != bh && buffer_revokevalid(bh2))
+				/* ...then it better be revoked too,
+				 * since it's illegal to create a revoke
+				 * record against a buffer_head which is
+				 * not marked revoked --- that would
+				 * risk missing a subsequent revoke
+				 * cancel. */
+				J_ASSERT_BH(bh2, buffer_revoked(bh2));
+			put_bh(bh2);
+		}
+	}
+#endif
+
+	/* We really ought not ever to revoke twice in a row without
+           first having the revoke cancelled: it's illegal to free a
+           block twice without allocating it in between! */
+	if (bh) {
+		if (!J_EXPECT_BH(bh, !buffer_revoked(bh),
+				 "inconsistent data on disk")) {
+			if (!bh_in)
+				brelse(bh);
+			return -EIO;
+		}
+		set_buffer_revoked(bh);
+		set_buffer_revokevalid(bh);
+		if (bh_in) {
+			BUFFER_TRACE(bh_in, "call jbd2_journal_forget");
+			jbd2_journal_forget(handle, bh_in);
+		} else {
+			BUFFER_TRACE(bh, "call brelse");
+			__brelse(bh);
+		}
+	}
+
+	jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in);
+	err = insert_revoke_hash(journal, blocknr,
+				handle->h_transaction->t_tid);
+	BUFFER_TRACE(bh_in, "exit");
+	return err;
+}
+
+/*
+ * Cancel an outstanding revoke.  For use only internally by the
+ * journaling code (called from jbd2_journal_get_write_access).
+ *
+ * We trust buffer_revoked() on the buffer if the buffer is already
+ * being journaled: if there is no revoke pending on the buffer, then we
+ * don't do anything here.
+ *
+ * This would break if it were possible for a buffer to be revoked and
+ * discarded, and then reallocated within the same transaction.  In such
+ * a case we would have lost the revoked bit, but when we arrived here
+ * the second time we would still have a pending revoke to cancel.  So,
+ * do not trust the Revoked bit on buffers unless RevokeValid is also
+ * set.
+ *
+ * The caller must have the journal locked.
+ */
+int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+{
+	struct jbd2_revoke_record_s *record;
+	journal_t *journal = handle->h_transaction->t_journal;
+	int need_cancel;
+	int did_revoke = 0;	/* akpm: debug */
+	struct buffer_head *bh = jh2bh(jh);
+
+	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+
+	/* Is the existing Revoke bit valid?  If so, we trust it, and
+	 * only perform the full cancel if the revoke bit is set.  If
+	 * not, we can't trust the revoke bit, and we need to do the
+	 * full search for a revoke record. */
+	if (test_set_buffer_revokevalid(bh)) {
+		need_cancel = test_clear_buffer_revoked(bh);
+	} else {
+		need_cancel = 1;
+		clear_buffer_revoked(bh);
+	}
+
+	if (need_cancel) {
+		record = find_revoke_record(journal, bh->b_blocknr);
+		if (record) {
+			jbd_debug(4, "cancelled existing revoke on "
+				  "blocknr %llu\n", (unsigned long long)bh->b_blocknr);
+			spin_lock(&journal->j_revoke_lock);
+			list_del(&record->hash);
+			spin_unlock(&journal->j_revoke_lock);
+			kmem_cache_free(jbd2_revoke_record_cache, record);
+			did_revoke = 1;
+		}
+	}
+
+#ifdef JBD_EXPENSIVE_CHECKING
+	/* There better not be one left behind by now! */
+	record = find_revoke_record(journal, bh->b_blocknr);
+	J_ASSERT_JH(jh, record == NULL);
+#endif
+
+	/* Finally, have we just cleared revoke on an unhashed
+	 * buffer_head?  If so, we'd better make sure we clear the
+	 * revoked status on any hashed alias too, otherwise the revoke
+	 * state machine will get very upset later on. */
+	if (need_cancel) {
+		struct buffer_head *bh2;
+		bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+		if (bh2) {
+			if (bh2 != bh)
+				clear_buffer_revoked(bh2);
+			__brelse(bh2);
+		}
+	}
+	return did_revoke;
+}
+
+/* journal_switch_revoke table select j_revoke for next transaction
+ * we do not want to suspend any processing until all revokes are
+ * written -bzzz
+ */
+void jbd2_journal_switch_revoke_table(journal_t *journal)
+{
+	int i;
+
+	if (journal->j_revoke == journal->j_revoke_table[0])
+		journal->j_revoke = journal->j_revoke_table[1];
+	else
+		journal->j_revoke = journal->j_revoke_table[0];
+
+	for (i = 0; i < journal->j_revoke->hash_size; i++)
+		INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]);
+}
+
+/*
+ * Write revoke records to the journal for all entries in the current
+ * revoke hash, deleting the entries as we go.
+ *
+ * Called with the journal lock held.
+ */
+
+void jbd2_journal_write_revoke_records(journal_t *journal,
+				  transaction_t *transaction)
+{
+	struct journal_head *descriptor;
+	struct jbd2_revoke_record_s *record;
+	struct jbd2_revoke_table_s *revoke;
+	struct list_head *hash_list;
+	int i, offset, count;
+
+	descriptor = NULL;
+	offset = 0;
+	count = 0;
+
+	/* select revoke table for committing transaction */
+	revoke = journal->j_revoke == journal->j_revoke_table[0] ?
+		journal->j_revoke_table[1] : journal->j_revoke_table[0];
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		hash_list = &revoke->hash_table[i];
+
+		while (!list_empty(hash_list)) {
+			record = (struct jbd2_revoke_record_s *)
+				hash_list->next;
+			write_one_revoke_record(journal, transaction,
+						&descriptor, &offset,
+						record);
+			count++;
+			list_del(&record->hash);
+			kmem_cache_free(jbd2_revoke_record_cache, record);
+		}
+	}
+	if (descriptor)
+		flush_descriptor(journal, descriptor, offset);
+	jbd_debug(1, "Wrote %d revoke records\n", count);
+}
+
+/*
+ * Write out one revoke record.  We need to create a new descriptor
+ * block if the old one is full or if we have not already created one.
+ */
+
+static void write_one_revoke_record(journal_t *journal,
+				    transaction_t *transaction,
+				    struct journal_head **descriptorp,
+				    int *offsetp,
+				    struct jbd2_revoke_record_s *record)
+{
+	struct journal_head *descriptor;
+	int offset;
+	journal_header_t *header;
+
+	/* If we are already aborting, this all becomes a noop.  We
+           still need to go round the loop in
+           jbd2_journal_write_revoke_records in order to free all of the
+           revoke records: only the IO to the journal is omitted. */
+	if (is_journal_aborted(journal))
+		return;
+
+	descriptor = *descriptorp;
+	offset = *offsetp;
+
+	/* Make sure we have a descriptor with space left for the record */
+	if (descriptor) {
+		if (offset == journal->j_blocksize) {
+			flush_descriptor(journal, descriptor, offset);
+			descriptor = NULL;
+		}
+	}
+
+	if (!descriptor) {
+		descriptor = jbd2_journal_get_descriptor_buffer(journal);
+		if (!descriptor)
+			return;
+		header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+		header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
+		header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
+		header->h_sequence  = cpu_to_be32(transaction->t_tid);
+
+		/* Record it so that we can wait for IO completion later */
+		JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
+		jbd2_journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+
+		offset = sizeof(jbd2_journal_revoke_header_t);
+		*descriptorp = descriptor;
+	}
+
+	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) {
+		* ((__be64 *)(&jh2bh(descriptor)->b_data[offset])) =
+			cpu_to_be64(record->blocknr);
+		offset += 8;
+
+	} else {
+		* ((__be32 *)(&jh2bh(descriptor)->b_data[offset])) =
+			cpu_to_be32(record->blocknr);
+		offset += 4;
+	}
+
+	*offsetp = offset;
+}
+
+/*
+ * Flush a revoke descriptor out to the journal.  If we are aborting,
+ * this is a noop; otherwise we are generating a buffer which needs to
+ * be waited for during commit, so it has to go onto the appropriate
+ * journal buffer list.
+ */
+
+static void flush_descriptor(journal_t *journal,
+			     struct journal_head *descriptor,
+			     int offset)
+{
+	jbd2_journal_revoke_header_t *header;
+	struct buffer_head *bh = jh2bh(descriptor);
+
+	if (is_journal_aborted(journal)) {
+		put_bh(bh);
+		return;
+	}
+
+	header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+	header->r_count = cpu_to_be32(offset);
+	set_buffer_jwrite(bh);
+	BUFFER_TRACE(bh, "write");
+	set_buffer_dirty(bh);
+	ll_rw_block(SWRITE, 1, &bh);
+}
+#endif
+
+/*
+ * Revoke support for recovery.
+ *
+ * Recovery needs to be able to:
+ *
+ *  record all revoke records, including the tid of the latest instance
+ *  of each revoke in the journal
+ *
+ *  check whether a given block in a given transaction should be replayed
+ *  (ie. has not been revoked by a revoke record in that or a subsequent
+ *  transaction)
+ *
+ *  empty the revoke table after recovery.
+ */
+
+/*
+ * First, setting revoke records.  We create a new revoke record for
+ * every block ever revoked in the log as we scan it for recovery, and
+ * we update the existing records if we find multiple revokes for a
+ * single block.
+ */
+
+int jbd2_journal_set_revoke(journal_t *journal,
+		       unsigned long long blocknr,
+		       tid_t sequence)
+{
+	struct jbd2_revoke_record_s *record;
+
+	record = find_revoke_record(journal, blocknr);
+	if (record) {
+		/* If we have multiple occurrences, only record the
+		 * latest sequence number in the hashed record */
+		if (tid_gt(sequence, record->sequence))
+			record->sequence = sequence;
+		return 0;
+	}
+	return insert_revoke_hash(journal, blocknr, sequence);
+}
+
+/*
+ * Test revoke records.  For a given block referenced in the log, has
+ * that block been revoked?  A revoke record with a given transaction
+ * sequence number revokes all blocks in that transaction and earlier
+ * ones, but later transactions still need replayed.
+ */
+
+int jbd2_journal_test_revoke(journal_t *journal,
+			unsigned long long blocknr,
+			tid_t sequence)
+{
+	struct jbd2_revoke_record_s *record;
+
+	record = find_revoke_record(journal, blocknr);
+	if (!record)
+		return 0;
+	if (tid_gt(sequence, record->sequence))
+		return 0;
+	return 1;
+}
+
+/*
+ * Finally, once recovery is over, we need to clear the revoke table so
+ * that it can be reused by the running filesystem.
+ */
+
+void jbd2_journal_clear_revoke(journal_t *journal)
+{
+	int i;
+	struct list_head *hash_list;
+	struct jbd2_revoke_record_s *record;
+	struct jbd2_revoke_table_s *revoke;
+
+	revoke = journal->j_revoke;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		hash_list = &revoke->hash_table[i];
+		while (!list_empty(hash_list)) {
+			record = (struct jbd2_revoke_record_s*) hash_list->next;
+			list_del(&record->hash);
+			kmem_cache_free(jbd2_revoke_record_cache, record);
+		}
+	}
+}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
new file mode 100644
index 000000000000..b6cf2be845a1
--- /dev/null
+++ b/fs/jbd2/transaction.c
@@ -0,0 +1,2081 @@
+/*
+ * linux/fs/transaction.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Generic filesystem transaction handling code; part of the ext2fs
+ * journaling system.
+ *
+ * This file manages transactions (compound commits managed by the
+ * journaling code) and handles (individual atomic operations by the
+ * filesystem).
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/smp_lock.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+
+/*
+ * jbd2_get_transaction: obtain a new transaction_t object.
+ *
+ * Simply allocate and initialise a new transaction.  Create it in
+ * RUNNING state and add it to the current journal (which should not
+ * have an existing running transaction: we only make a new transaction
+ * once we have started to commit the old one).
+ *
+ * Preconditions:
+ *	The journal MUST be locked.  We don't perform atomic mallocs on the
+ *	new transaction	and we can't block without protecting against other
+ *	processes trying to touch the journal while it is in transition.
+ *
+ * Called under j_state_lock
+ */
+
+static transaction_t *
+jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
+{
+	transaction->t_journal = journal;
+	transaction->t_state = T_RUNNING;
+	transaction->t_tid = journal->j_transaction_sequence++;
+	transaction->t_expires = jiffies + journal->j_commit_interval;
+	spin_lock_init(&transaction->t_handle_lock);
+
+	/* Set up the commit timer for the new transaction. */
+	journal->j_commit_timer.expires = transaction->t_expires;
+	add_timer(&journal->j_commit_timer);
+
+	J_ASSERT(journal->j_running_transaction == NULL);
+	journal->j_running_transaction = transaction;
+
+	return transaction;
+}
+
+/*
+ * Handle management.
+ *
+ * A handle_t is an object which represents a single atomic update to a
+ * filesystem, and which tracks all of the modifications which form part
+ * of that one update.
+ */
+
+/*
+ * start_this_handle: Given a handle, deal with any locking or stalling
+ * needed to make sure that there is enough journal space for the handle
+ * to begin.  Attach the handle to a transaction and set up the
+ * transaction's buffer credits.
+ */
+
+static int start_this_handle(journal_t *journal, handle_t *handle)
+{
+	transaction_t *transaction;
+	int needed;
+	int nblocks = handle->h_buffer_credits;
+	transaction_t *new_transaction = NULL;
+	int ret = 0;
+
+	if (nblocks > journal->j_max_transaction_buffers) {
+		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
+		       current->comm, nblocks,
+		       journal->j_max_transaction_buffers);
+		ret = -ENOSPC;
+		goto out;
+	}
+
+alloc_transaction:
+	if (!journal->j_running_transaction) {
+		new_transaction = jbd_kmalloc(sizeof(*new_transaction),
+						GFP_NOFS);
+		if (!new_transaction) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		memset(new_transaction, 0, sizeof(*new_transaction));
+	}
+
+	jbd_debug(3, "New handle %p going live.\n", handle);
+
+repeat:
+
+	/*
+	 * We need to hold j_state_lock until t_updates has been incremented,
+	 * for proper journal barrier handling
+	 */
+	spin_lock(&journal->j_state_lock);
+repeat_locked:
+	if (is_journal_aborted(journal) ||
+	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
+		spin_unlock(&journal->j_state_lock);
+		ret = -EROFS;
+		goto out;
+	}
+
+	/* Wait on the journal's transaction barrier if necessary */
+	if (journal->j_barrier_count) {
+		spin_unlock(&journal->j_state_lock);
+		wait_event(journal->j_wait_transaction_locked,
+				journal->j_barrier_count == 0);
+		goto repeat;
+	}
+
+	if (!journal->j_running_transaction) {
+		if (!new_transaction) {
+			spin_unlock(&journal->j_state_lock);
+			goto alloc_transaction;
+		}
+		jbd2_get_transaction(journal, new_transaction);
+		new_transaction = NULL;
+	}
+
+	transaction = journal->j_running_transaction;
+
+	/*
+	 * If the current transaction is locked down for commit, wait for the
+	 * lock to be released.
+	 */
+	if (transaction->t_state == T_LOCKED) {
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(&journal->j_wait_transaction_locked,
+					&wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock(&journal->j_state_lock);
+		schedule();
+		finish_wait(&journal->j_wait_transaction_locked, &wait);
+		goto repeat;
+	}
+
+	/*
+	 * If there is not enough space left in the log to write all potential
+	 * buffers requested by this operation, we need to stall pending a log
+	 * checkpoint to free some more log space.
+	 */
+	spin_lock(&transaction->t_handle_lock);
+	needed = transaction->t_outstanding_credits + nblocks;
+
+	if (needed > journal->j_max_transaction_buffers) {
+		/*
+		 * If the current transaction is already too large, then start
+		 * to commit it: we can then go back and attach this handle to
+		 * a new transaction.
+		 */
+		DEFINE_WAIT(wait);
+
+		jbd_debug(2, "Handle %p starting new commit...\n", handle);
+		spin_unlock(&transaction->t_handle_lock);
+		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+				TASK_UNINTERRUPTIBLE);
+		__jbd2_log_start_commit(journal, transaction->t_tid);
+		spin_unlock(&journal->j_state_lock);
+		schedule();
+		finish_wait(&journal->j_wait_transaction_locked, &wait);
+		goto repeat;
+	}
+
+	/*
+	 * The commit code assumes that it can get enough log space
+	 * without forcing a checkpoint.  This is *critical* for
+	 * correctness: a checkpoint of a buffer which is also
+	 * associated with a committing transaction creates a deadlock,
+	 * so commit simply cannot force through checkpoints.
+	 *
+	 * We must therefore ensure the necessary space in the journal
+	 * *before* starting to dirty potentially checkpointed buffers
+	 * in the new transaction.
+	 *
+	 * The worst part is, any transaction currently committing can
+	 * reduce the free space arbitrarily.  Be careful to account for
+	 * those buffers when checkpointing.
+	 */
+
+	/*
+	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
+	 * a _lot_ of headroom: 1/4 of the journal plus the size of
+	 * the committing transaction.  Really, we only need to give it
+	 * committing_transaction->t_outstanding_credits plus "enough" for
+	 * the log control blocks.
+	 * Also, this test is inconsitent with the matching one in
+	 * jbd2_journal_extend().
+	 */
+	if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
+		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
+		spin_unlock(&transaction->t_handle_lock);
+		__jbd2_log_wait_for_space(journal);
+		goto repeat_locked;
+	}
+
+	/* OK, account for the buffers that this operation expects to
+	 * use and add the handle to the running transaction. */
+
+	handle->h_transaction = transaction;
+	transaction->t_outstanding_credits += nblocks;
+	transaction->t_updates++;
+	transaction->t_handle_count++;
+	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+		  handle, nblocks, transaction->t_outstanding_credits,
+		  __jbd2_log_space_left(journal));
+	spin_unlock(&transaction->t_handle_lock);
+	spin_unlock(&journal->j_state_lock);
+out:
+	if (unlikely(new_transaction))		/* It's usually NULL */
+		kfree(new_transaction);
+	return ret;
+}
+
+/* Allocate a new handle.  This should probably be in a slab... */
+static handle_t *new_handle(int nblocks)
+{
+	handle_t *handle = jbd_alloc_handle(GFP_NOFS);
+	if (!handle)
+		return NULL;
+	memset(handle, 0, sizeof(*handle));
+	handle->h_buffer_credits = nblocks;
+	handle->h_ref = 1;
+
+	return handle;
+}
+
+/**
+ * handle_t *jbd2_journal_start() - Obtain a new handle.
+ * @journal: Journal to start transaction on.
+ * @nblocks: number of block buffer we might modify
+ *
+ * We make sure that the transaction can guarantee at least nblocks of
+ * modified buffers in the log.  We block until the log can guarantee
+ * that much space.
+ *
+ * This function is visible to journal users (like ext3fs), so is not
+ * called with the journal already locked.
+ *
+ * Return a pointer to a newly allocated handle, or NULL on failure
+ */
+handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+{
+	handle_t *handle = journal_current_handle();
+	int err;
+
+	if (!journal)
+		return ERR_PTR(-EROFS);
+
+	if (handle) {
+		J_ASSERT(handle->h_transaction->t_journal == journal);
+		handle->h_ref++;
+		return handle;
+	}
+
+	handle = new_handle(nblocks);
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+
+	current->journal_info = handle;
+
+	err = start_this_handle(journal, handle);
+	if (err < 0) {
+		jbd_free_handle(handle);
+		current->journal_info = NULL;
+		handle = ERR_PTR(err);
+	}
+	return handle;
+}
+
+/**
+ * int jbd2_journal_extend() - extend buffer credits.
+ * @handle:  handle to 'extend'
+ * @nblocks: nr blocks to try to extend by.
+ *
+ * Some transactions, such as large extends and truncates, can be done
+ * atomically all at once or in several stages.  The operation requests
+ * a credit for a number of buffer modications in advance, but can
+ * extend its credit if it needs more.
+ *
+ * jbd2_journal_extend tries to give the running handle more buffer credits.
+ * It does not guarantee that allocation - this is a best-effort only.
+ * The calling process MUST be able to deal cleanly with a failure to
+ * extend here.
+ *
+ * Return 0 on success, non-zero on failure.
+ *
+ * return code < 0 implies an error
+ * return code > 0 implies normal transaction-full status.
+ */
+int jbd2_journal_extend(handle_t *handle, int nblocks)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	int result;
+	int wanted;
+
+	result = -EIO;
+	if (is_handle_aborted(handle))
+		goto out;
+
+	result = 1;
+
+	spin_lock(&journal->j_state_lock);
+
+	/* Don't extend a locked-down transaction! */
+	if (handle->h_transaction->t_state != T_RUNNING) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "transaction not running\n", handle, nblocks);
+		goto error_out;
+	}
+
+	spin_lock(&transaction->t_handle_lock);
+	wanted = transaction->t_outstanding_credits + nblocks;
+
+	if (wanted > journal->j_max_transaction_buffers) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "transaction too large\n", handle, nblocks);
+		goto unlock;
+	}
+
+	if (wanted > __jbd2_log_space_left(journal)) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "insufficient log space\n", handle, nblocks);
+		goto unlock;
+	}
+
+	handle->h_buffer_credits += nblocks;
+	transaction->t_outstanding_credits += nblocks;
+	result = 0;
+
+	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+unlock:
+	spin_unlock(&transaction->t_handle_lock);
+error_out:
+	spin_unlock(&journal->j_state_lock);
+out:
+	return result;
+}
+
+
+/**
+ * int jbd2_journal_restart() - restart a handle .
+ * @handle:  handle to restart
+ * @nblocks: nr credits requested
+ *
+ * Restart a handle for a multi-transaction filesystem
+ * operation.
+ *
+ * If the jbd2_journal_extend() call above fails to grant new buffer credits
+ * to a running handle, a call to jbd2_journal_restart will commit the
+ * handle's transaction so far and reattach the handle to a new
+ * transaction capabable of guaranteeing the requested number of
+ * credits.
+ */
+
+int jbd2_journal_restart(handle_t *handle, int nblocks)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	int ret;
+
+	/* If we've had an abort of any type, don't even think about
+	 * actually doing the restart! */
+	if (is_handle_aborted(handle))
+		return 0;
+
+	/*
+	 * First unlink the handle from its current transaction, and start the
+	 * commit on that.
+	 */
+	J_ASSERT(transaction->t_updates > 0);
+	J_ASSERT(journal_current_handle() == handle);
+
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&transaction->t_handle_lock);
+	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+	transaction->t_updates--;
+
+	if (!transaction->t_updates)
+		wake_up(&journal->j_wait_updates);
+	spin_unlock(&transaction->t_handle_lock);
+
+	jbd_debug(2, "restarting handle %p\n", handle);
+	__jbd2_log_start_commit(journal, transaction->t_tid);
+	spin_unlock(&journal->j_state_lock);
+
+	handle->h_buffer_credits = nblocks;
+	ret = start_this_handle(journal, handle);
+	return ret;
+}
+
+
+/**
+ * void jbd2_journal_lock_updates () - establish a transaction barrier.
+ * @journal:  Journal to establish a barrier on.
+ *
+ * This locks out any further updates from being started, and blocks
+ * until all existing updates have completed, returning only once the
+ * journal is in a quiescent state with no updates running.
+ *
+ * The journal lock should not be held on entry.
+ */
+void jbd2_journal_lock_updates(journal_t *journal)
+{
+	DEFINE_WAIT(wait);
+
+	spin_lock(&journal->j_state_lock);
+	++journal->j_barrier_count;
+
+	/* Wait until there are no running updates */
+	while (1) {
+		transaction_t *transaction = journal->j_running_transaction;
+
+		if (!transaction)
+			break;
+
+		spin_lock(&transaction->t_handle_lock);
+		if (!transaction->t_updates) {
+			spin_unlock(&transaction->t_handle_lock);
+			break;
+		}
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+				TASK_UNINTERRUPTIBLE);
+		spin_unlock(&transaction->t_handle_lock);
+		spin_unlock(&journal->j_state_lock);
+		schedule();
+		finish_wait(&journal->j_wait_updates, &wait);
+		spin_lock(&journal->j_state_lock);
+	}
+	spin_unlock(&journal->j_state_lock);
+
+	/*
+	 * We have now established a barrier against other normal updates, but
+	 * we also need to barrier against other jbd2_journal_lock_updates() calls
+	 * to make sure that we serialise special journal-locked operations
+	 * too.
+	 */
+	mutex_lock(&journal->j_barrier);
+}
+
+/**
+ * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
+ * @journal:  Journal to release the barrier on.
+ *
+ * Release a transaction barrier obtained with jbd2_journal_lock_updates().
+ *
+ * Should be called without the journal lock held.
+ */
+void jbd2_journal_unlock_updates (journal_t *journal)
+{
+	J_ASSERT(journal->j_barrier_count != 0);
+
+	mutex_unlock(&journal->j_barrier);
+	spin_lock(&journal->j_state_lock);
+	--journal->j_barrier_count;
+	spin_unlock(&journal->j_state_lock);
+	wake_up(&journal->j_wait_transaction_locked);
+}
+
+/*
+ * Report any unexpected dirty buffers which turn up.  Normally those
+ * indicate an error, but they can occur if the user is running (say)
+ * tune2fs to modify the live filesystem, so we need the option of
+ * continuing as gracefully as possible.  #
+ *
+ * The caller should already hold the journal lock and
+ * j_list_lock spinlock: most callers will need those anyway
+ * in order to probe the buffer's journaling state safely.
+ */
+static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
+{
+	int jlist;
+
+	/* If this buffer is one which might reasonably be dirty
+	 * --- ie. data, or not part of this journal --- then
+	 * we're OK to leave it alone, but otherwise we need to
+	 * move the dirty bit to the journal's own internal
+	 * JBDDirty bit. */
+	jlist = jh->b_jlist;
+
+	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		struct buffer_head *bh = jh2bh(jh);
+
+		if (test_clear_buffer_dirty(bh))
+			set_buffer_jbddirty(bh);
+	}
+}
+
+/*
+ * If the buffer is already part of the current transaction, then there
+ * is nothing we need to do.  If it is already part of a prior
+ * transaction which we are still committing to disk, then we need to
+ * make sure that we do not overwrite the old copy: we do copy-out to
+ * preserve the copy going to disk.  We also account the buffer against
+ * the handle's metadata buffer credits (unless the buffer is already
+ * part of the transaction, that is).
+ *
+ */
+static int
+do_get_write_access(handle_t *handle, struct journal_head *jh,
+			int force_copy)
+{
+	struct buffer_head *bh;
+	transaction_t *transaction;
+	journal_t *journal;
+	int error;
+	char *frozen_buffer = NULL;
+	int need_copy = 0;
+
+	if (is_handle_aborted(handle))
+		return -EROFS;
+
+	transaction = handle->h_transaction;
+	journal = transaction->t_journal;
+
+	jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+
+	JBUFFER_TRACE(jh, "entry");
+repeat:
+	bh = jh2bh(jh);
+
+	/* @@@ Need to check for errors here at some point. */
+
+	lock_buffer(bh);
+	jbd_lock_bh_state(bh);
+
+	/* We now hold the buffer lock so it is safe to query the buffer
+	 * state.  Is the buffer dirty?
+	 *
+	 * If so, there are two possibilities.  The buffer may be
+	 * non-journaled, and undergoing a quite legitimate writeback.
+	 * Otherwise, it is journaled, and we don't expect dirty buffers
+	 * in that state (the buffers should be marked JBD_Dirty
+	 * instead.)  So either the IO is being done under our own
+	 * control and this is a bug, or it's a third party IO such as
+	 * dump(8) (which may leave the buffer scheduled for read ---
+	 * ie. locked but not dirty) or tune2fs (which may actually have
+	 * the buffer dirtied, ugh.)  */
+
+	if (buffer_dirty(bh)) {
+		/*
+		 * First question: is this buffer already part of the current
+		 * transaction or the existing committing transaction?
+		 */
+		if (jh->b_transaction) {
+			J_ASSERT_JH(jh,
+				jh->b_transaction == transaction ||
+				jh->b_transaction ==
+					journal->j_committing_transaction);
+			if (jh->b_next_transaction)
+				J_ASSERT_JH(jh, jh->b_next_transaction ==
+							transaction);
+		}
+		/*
+		 * In any case we need to clean the dirty flag and we must
+		 * do it under the buffer lock to be sure we don't race
+		 * with running write-out.
+		 */
+		JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+		jbd_unexpected_dirty_buffer(jh);
+	}
+
+	unlock_buffer(bh);
+
+	error = -EROFS;
+	if (is_handle_aborted(handle)) {
+		jbd_unlock_bh_state(bh);
+		goto out;
+	}
+	error = 0;
+
+	/*
+	 * The buffer is already part of this transaction if b_transaction or
+	 * b_next_transaction points to it
+	 */
+	if (jh->b_transaction == transaction ||
+	    jh->b_next_transaction == transaction)
+		goto done;
+
+	/*
+	 * If there is already a copy-out version of this buffer, then we don't
+	 * need to make another one
+	 */
+	if (jh->b_frozen_data) {
+		JBUFFER_TRACE(jh, "has frozen data");
+		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+		jh->b_next_transaction = transaction;
+		goto done;
+	}
+
+	/* Is there data here we need to preserve? */
+
+	if (jh->b_transaction && jh->b_transaction != transaction) {
+		JBUFFER_TRACE(jh, "owned by older transaction");
+		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+		J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_committing_transaction);
+
+		/* There is one case we have to be very careful about.
+		 * If the committing transaction is currently writing
+		 * this buffer out to disk and has NOT made a copy-out,
+		 * then we cannot modify the buffer contents at all
+		 * right now.  The essence of copy-out is that it is the
+		 * extra copy, not the primary copy, which gets
+		 * journaled.  If the primary copy is already going to
+		 * disk then we cannot do copy-out here. */
+
+		if (jh->b_jlist == BJ_Shadow) {
+			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
+			wait_queue_head_t *wqh;
+
+			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
+
+			JBUFFER_TRACE(jh, "on shadow: sleep");
+			jbd_unlock_bh_state(bh);
+			/* commit wakes up all shadow buffers after IO */
+			for ( ; ; ) {
+				prepare_to_wait(wqh, &wait.wait,
+						TASK_UNINTERRUPTIBLE);
+				if (jh->b_jlist != BJ_Shadow)
+					break;
+				schedule();
+			}
+			finish_wait(wqh, &wait.wait);
+			goto repeat;
+		}
+
+		/* Only do the copy if the currently-owning transaction
+		 * still needs it.  If it is on the Forget list, the
+		 * committing transaction is past that stage.  The
+		 * buffer had better remain locked during the kmalloc,
+		 * but that should be true --- we hold the journal lock
+		 * still and the buffer is already on the BUF_JOURNAL
+		 * list so won't be flushed.
+		 *
+		 * Subtle point, though: if this is a get_undo_access,
+		 * then we will be relying on the frozen_data to contain
+		 * the new value of the committed_data record after the
+		 * transaction, so we HAVE to force the frozen_data copy
+		 * in that case. */
+
+		if (jh->b_jlist != BJ_Forget || force_copy) {
+			JBUFFER_TRACE(jh, "generate frozen data");
+			if (!frozen_buffer) {
+				JBUFFER_TRACE(jh, "allocate memory for buffer");
+				jbd_unlock_bh_state(bh);
+				frozen_buffer =
+					jbd2_slab_alloc(jh2bh(jh)->b_size,
+							 GFP_NOFS);
+				if (!frozen_buffer) {
+					printk(KERN_EMERG
+					       "%s: OOM for frozen_buffer\n",
+					       __FUNCTION__);
+					JBUFFER_TRACE(jh, "oom!");
+					error = -ENOMEM;
+					jbd_lock_bh_state(bh);
+					goto done;
+				}
+				goto repeat;
+			}
+			jh->b_frozen_data = frozen_buffer;
+			frozen_buffer = NULL;
+			need_copy = 1;
+		}
+		jh->b_next_transaction = transaction;
+	}
+
+
+	/*
+	 * Finally, if the buffer is not journaled right now, we need to make
+	 * sure it doesn't get written to disk before the caller actually
+	 * commits the new data
+	 */
+	if (!jh->b_transaction) {
+		JBUFFER_TRACE(jh, "no transaction");
+		J_ASSERT_JH(jh, !jh->b_next_transaction);
+		jh->b_transaction = transaction;
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		spin_lock(&journal->j_list_lock);
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+		spin_unlock(&journal->j_list_lock);
+	}
+
+done:
+	if (need_copy) {
+		struct page *page;
+		int offset;
+		char *source;
+
+		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
+			    "Possible IO failure.\n");
+		page = jh2bh(jh)->b_page;
+		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+		source = kmap_atomic(page, KM_USER0);
+		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
+		kunmap_atomic(source, KM_USER0);
+	}
+	jbd_unlock_bh_state(bh);
+
+	/*
+	 * If we are about to journal a buffer, then any revoke pending on it is
+	 * no longer valid
+	 */
+	jbd2_journal_cancel_revoke(handle, jh);
+
+out:
+	if (unlikely(frozen_buffer))	/* It's usually NULL */
+		jbd2_slab_free(frozen_buffer, bh->b_size);
+
+	JBUFFER_TRACE(jh, "exit");
+	return error;
+}
+
+/**
+ * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
+ * @handle: transaction to add buffer modifications to
+ * @bh:     bh to be used for metadata writes
+ * @credits: variable that will receive credits for the buffer
+ *
+ * Returns an error code or 0 on success.
+ *
+ * In full data journalling mode the buffer may be of type BJ_AsyncData,
+ * because we're write()ing a buffer which is also part of a shared mapping.
+ */
+
+int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
+{
+	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	int rc;
+
+	/* We do not want to get caught playing with fields which the
+	 * log thread also manipulates.  Make sure that the buffer
+	 * completes any outstanding IO before proceeding. */
+	rc = do_get_write_access(handle, jh, 0);
+	jbd2_journal_put_journal_head(jh);
+	return rc;
+}
+
+
+/*
+ * When the user wants to journal a newly created buffer_head
+ * (ie. getblk() returned a new buffer and we are going to populate it
+ * manually rather than reading off disk), then we need to keep the
+ * buffer_head locked until it has been completely filled with new
+ * data.  In this case, we should be able to make the assertion that
+ * the bh is not already part of an existing transaction.
+ *
+ * The buffer should already be locked by the caller by this point.
+ * There is no lock ranking violation: it was a newly created,
+ * unlocked buffer beforehand. */
+
+/**
+ * int jbd2_journal_get_create_access () - notify intent to use newly created bh
+ * @handle: transaction to new buffer to
+ * @bh: new buffer.
+ *
+ * Call this if you create a new bh.
+ */
+int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	int err;
+
+	jbd_debug(5, "journal_head %p\n", jh);
+	err = -EROFS;
+	if (is_handle_aborted(handle))
+		goto out;
+	err = 0;
+
+	JBUFFER_TRACE(jh, "entry");
+	/*
+	 * The buffer may already belong to this transaction due to pre-zeroing
+	 * in the filesystem's new_block code.  It may also be on the previous,
+	 * committing transaction's lists, but it HAS to be in Forget state in
+	 * that case: the transaction must have deleted the buffer for it to be
+	 * reused here.
+	 */
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+		jh->b_transaction == NULL ||
+		(jh->b_transaction == journal->j_committing_transaction &&
+			  jh->b_jlist == BJ_Forget)));
+
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+
+	if (jh->b_transaction == NULL) {
+		jh->b_transaction = transaction;
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+	} else if (jh->b_transaction == journal->j_committing_transaction) {
+		JBUFFER_TRACE(jh, "set next transaction");
+		jh->b_next_transaction = transaction;
+	}
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+
+	/*
+	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
+	 * blocks which contain freed but then revoked metadata.  We need
+	 * to cancel the revoke in case we end up freeing it yet again
+	 * and the reallocating as data - this would cause a second revoke,
+	 * which hits an assertion error.
+	 */
+	JBUFFER_TRACE(jh, "cancelling revoke");
+	jbd2_journal_cancel_revoke(handle, jh);
+	jbd2_journal_put_journal_head(jh);
+out:
+	return err;
+}
+
+/**
+ * int jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
+ *     non-rewindable consequences
+ * @handle: transaction
+ * @bh: buffer to undo
+ * @credits: store the number of taken credits here (if not NULL)
+ *
+ * Sometimes there is a need to distinguish between metadata which has
+ * been committed to disk and that which has not.  The ext3fs code uses
+ * this for freeing and allocating space, we have to make sure that we
+ * do not reuse freed space until the deallocation has been committed,
+ * since if we overwrote that space we would make the delete
+ * un-rewindable in case of a crash.
+ *
+ * To deal with that, jbd2_journal_get_undo_access requests write access to a
+ * buffer for parts of non-rewindable operations such as delete
+ * operations on the bitmaps.  The journaling code must keep a copy of
+ * the buffer's contents prior to the undo_access call until such time
+ * as we know that the buffer has definitely been committed to disk.
+ *
+ * We never need to know which transaction the committed data is part
+ * of, buffers touched here are guaranteed to be dirtied later and so
+ * will be committed to a new transaction in due course, at which point
+ * we can discard the old committed data pointer.
+ *
+ * Returns error number or 0 on success.
+ */
+int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
+{
+	int err;
+	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+	char *committed_data = NULL;
+
+	JBUFFER_TRACE(jh, "entry");
+
+	/*
+	 * Do this first --- it can drop the journal lock, so we want to
+	 * make sure that obtaining the committed_data is done
+	 * atomically wrt. completion of any outstanding commits.
+	 */
+	err = do_get_write_access(handle, jh, 1);
+	if (err)
+		goto out;
+
+repeat:
+	if (!jh->b_committed_data) {
+		committed_data = jbd2_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS);
+		if (!committed_data) {
+			printk(KERN_EMERG "%s: No memory for committed data\n",
+				__FUNCTION__);
+			err = -ENOMEM;
+			goto out;
+		}
+	}
+
+	jbd_lock_bh_state(bh);
+	if (!jh->b_committed_data) {
+		/* Copy out the current buffer contents into the
+		 * preserved, committed copy. */
+		JBUFFER_TRACE(jh, "generate b_committed data");
+		if (!committed_data) {
+			jbd_unlock_bh_state(bh);
+			goto repeat;
+		}
+
+		jh->b_committed_data = committed_data;
+		committed_data = NULL;
+		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
+	}
+	jbd_unlock_bh_state(bh);
+out:
+	jbd2_journal_put_journal_head(jh);
+	if (unlikely(committed_data))
+		jbd2_slab_free(committed_data, bh->b_size);
+	return err;
+}
+
+/**
+ * int jbd2_journal_dirty_data() -  mark a buffer as containing dirty data which
+ *                             needs to be flushed before we can commit the
+ *                             current transaction.
+ * @handle: transaction
+ * @bh: bufferhead to mark
+ *
+ * The buffer is placed on the transaction's data list and is marked as
+ * belonging to the transaction.
+ *
+ * Returns error number or 0 on success.
+ *
+ * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
+ * by kswapd.
+ */
+int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
+{
+	journal_t *journal = handle->h_transaction->t_journal;
+	int need_brelse = 0;
+	struct journal_head *jh;
+
+	if (is_handle_aborted(handle))
+		return 0;
+
+	jh = jbd2_journal_add_journal_head(bh);
+	JBUFFER_TRACE(jh, "entry");
+
+	/*
+	 * The buffer could *already* be dirty.  Writeout can start
+	 * at any time.
+	 */
+	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
+
+	/*
+	 * What if the buffer is already part of a running transaction?
+	 *
+	 * There are two cases:
+	 * 1) It is part of the current running transaction.  Refile it,
+	 *    just in case we have allocated it as metadata, deallocated
+	 *    it, then reallocated it as data.
+	 * 2) It is part of the previous, still-committing transaction.
+	 *    If all we want to do is to guarantee that the buffer will be
+	 *    written to disk before this new transaction commits, then
+	 *    being sure that the *previous* transaction has this same
+	 *    property is sufficient for us!  Just leave it on its old
+	 *    transaction.
+	 *
+	 * In case (2), the buffer must not already exist as metadata
+	 * --- that would violate write ordering (a transaction is free
+	 * to write its data at any point, even before the previous
+	 * committing transaction has committed).  The caller must
+	 * never, ever allow this to happen: there's nothing we can do
+	 * about it in this layer.
+	 */
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+	if (jh->b_transaction) {
+		JBUFFER_TRACE(jh, "has transaction");
+		if (jh->b_transaction != handle->h_transaction) {
+			JBUFFER_TRACE(jh, "belongs to older transaction");
+			J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_committing_transaction);
+
+			/* @@@ IS THIS TRUE  ? */
+			/*
+			 * Not any more.  Scenario: someone does a write()
+			 * in data=journal mode.  The buffer's transaction has
+			 * moved into commit.  Then someone does another
+			 * write() to the file.  We do the frozen data copyout
+			 * and set b_next_transaction to point to j_running_t.
+			 * And while we're in that state, someone does a
+			 * writepage() in an attempt to pageout the same area
+			 * of the file via a shared mapping.  At present that
+			 * calls jbd2_journal_dirty_data(), and we get right here.
+			 * It may be too late to journal the data.  Simply
+			 * falling through to the next test will suffice: the
+			 * data will be dirty and wil be checkpointed.  The
+			 * ordering comments in the next comment block still
+			 * apply.
+			 */
+			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+
+			/*
+			 * If we're journalling data, and this buffer was
+			 * subject to a write(), it could be metadata, forget
+			 * or shadow against the committing transaction.  Now,
+			 * someone has dirtied the same darn page via a mapping
+			 * and it is being writepage()'d.
+			 * We *could* just steal the page from commit, with some
+			 * fancy locking there.  Instead, we just skip it -
+			 * don't tie the page's buffers to the new transaction
+			 * at all.
+			 * Implication: if we crash before the writepage() data
+			 * is written into the filesystem, recovery will replay
+			 * the write() data.
+			 */
+			if (jh->b_jlist != BJ_None &&
+					jh->b_jlist != BJ_SyncData &&
+					jh->b_jlist != BJ_Locked) {
+				JBUFFER_TRACE(jh, "Not stealing");
+				goto no_journal;
+			}
+
+			/*
+			 * This buffer may be undergoing writeout in commit.  We
+			 * can't return from here and let the caller dirty it
+			 * again because that can cause the write-out loop in
+			 * commit to never terminate.
+			 */
+			if (buffer_dirty(bh)) {
+				get_bh(bh);
+				spin_unlock(&journal->j_list_lock);
+				jbd_unlock_bh_state(bh);
+				need_brelse = 1;
+				sync_dirty_buffer(bh);
+				jbd_lock_bh_state(bh);
+				spin_lock(&journal->j_list_lock);
+				/* The buffer may become locked again at any
+				   time if it is redirtied */
+			}
+
+			/* journal_clean_data_list() may have got there first */
+			if (jh->b_transaction != NULL) {
+				JBUFFER_TRACE(jh, "unfile from commit");
+				__jbd2_journal_temp_unlink_buffer(jh);
+				/* It still points to the committing
+				 * transaction; move it to this one so
+				 * that the refile assert checks are
+				 * happy. */
+				jh->b_transaction = handle->h_transaction;
+			}
+			/* The buffer will be refiled below */
+
+		}
+		/*
+		 * Special case --- the buffer might actually have been
+		 * allocated and then immediately deallocated in the previous,
+		 * committing transaction, so might still be left on that
+		 * transaction's metadata lists.
+		 */
+		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
+			JBUFFER_TRACE(jh, "not on correct data list: unfile");
+			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
+			__jbd2_journal_temp_unlink_buffer(jh);
+			jh->b_transaction = handle->h_transaction;
+			JBUFFER_TRACE(jh, "file as data");
+			__jbd2_journal_file_buffer(jh, handle->h_transaction,
+						BJ_SyncData);
+		}
+	} else {
+		JBUFFER_TRACE(jh, "not on a transaction");
+		__jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
+	}
+no_journal:
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	if (need_brelse) {
+		BUFFER_TRACE(bh, "brelse");
+		__brelse(bh);
+	}
+	JBUFFER_TRACE(jh, "exit");
+	jbd2_journal_put_journal_head(jh);
+	return 0;
+}
+
+/**
+ * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
+ * @handle: transaction to add buffer to.
+ * @bh: buffer to mark
+ *
+ * mark dirty metadata which needs to be journaled as part of the current
+ * transaction.
+ *
+ * The buffer is placed on the transaction's metadata list and is marked
+ * as belonging to the transaction.
+ *
+ * Returns error number or 0 on success.
+ *
+ * Special care needs to be taken if the buffer already belongs to the
+ * current committing transaction (in which case we should have frozen
+ * data present for that commit).  In that case, we don't relink the
+ * buffer: that only gets done when the old transaction finally
+ * completes its commit.
+ */
+int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	struct journal_head *jh = bh2jh(bh);
+
+	jbd_debug(5, "journal_head %p\n", jh);
+	JBUFFER_TRACE(jh, "entry");
+	if (is_handle_aborted(handle))
+		goto out;
+
+	jbd_lock_bh_state(bh);
+
+	if (jh->b_modified == 0) {
+		/*
+		 * This buffer's got modified and becoming part
+		 * of the transaction. This needs to be done
+		 * once a transaction -bzzz
+		 */
+		jh->b_modified = 1;
+		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+		handle->h_buffer_credits--;
+	}
+
+	/*
+	 * fastpath, to avoid expensive locking.  If this buffer is already
+	 * on the running transaction's metadata list there is nothing to do.
+	 * Nobody can take it off again because there is a handle open.
+	 * I _think_ we're OK here with SMP barriers - a mistaken decision will
+	 * result in this test being false, so we go in and take the locks.
+	 */
+	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
+		JBUFFER_TRACE(jh, "fastpath");
+		J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_running_transaction);
+		goto out_unlock_bh;
+	}
+
+	set_buffer_jbddirty(bh);
+
+	/*
+	 * Metadata already on the current transaction list doesn't
+	 * need to be filed.  Metadata on another transaction's list must
+	 * be committing, and will be refiled once the commit completes:
+	 * leave it alone for now.
+	 */
+	if (jh->b_transaction != transaction) {
+		JBUFFER_TRACE(jh, "already on other transaction");
+		J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_committing_transaction);
+		J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+		/* And this case is illegal: we can't reuse another
+		 * transaction's data buffer, ever. */
+		goto out_unlock_bh;
+	}
+
+	/* That test should have eliminated the following case: */
+	J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+
+	JBUFFER_TRACE(jh, "file as BJ_Metadata");
+	spin_lock(&journal->j_list_lock);
+	__jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+	spin_unlock(&journal->j_list_lock);
+out_unlock_bh:
+	jbd_unlock_bh_state(bh);
+out:
+	JBUFFER_TRACE(jh, "exit");
+	return 0;
+}
+
+/*
+ * jbd2_journal_release_buffer: undo a get_write_access without any buffer
+ * updates, if the update decided in the end that it didn't need access.
+ *
+ */
+void
+jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
+{
+	BUFFER_TRACE(bh, "entry");
+}
+
+/**
+ * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
+ * @handle: transaction handle
+ * @bh:     bh to 'forget'
+ *
+ * We can only do the bforget if there are no commits pending against the
+ * buffer.  If the buffer is dirty in the current running transaction we
+ * can safely unlink it.
+ *
+ * bh may not be a journalled buffer at all - it may be a non-JBD
+ * buffer which came off the hashtable.  Check for this.
+ *
+ * Decrements bh->b_count by one.
+ *
+ * Allow this call even if the handle has aborted --- it may be part of
+ * the caller's cleanup after an abort.
+ */
+int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	struct journal_head *jh;
+	int drop_reserve = 0;
+	int err = 0;
+
+	BUFFER_TRACE(bh, "entry");
+
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+
+	if (!buffer_jbd(bh))
+		goto not_jbd;
+	jh = bh2jh(bh);
+
+	/* Critical error: attempting to delete a bitmap buffer, maybe?
+	 * Don't do any jbd operations, and return an error. */
+	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
+			 "inconsistent data on disk")) {
+		err = -EIO;
+		goto not_jbd;
+	}
+
+	/*
+	 * The buffer's going from the transaction, we must drop
+	 * all references -bzzz
+	 */
+	jh->b_modified = 0;
+
+	if (jh->b_transaction == handle->h_transaction) {
+		J_ASSERT_JH(jh, !jh->b_frozen_data);
+
+		/* If we are forgetting a buffer which is already part
+		 * of this transaction, then we can just drop it from
+		 * the transaction immediately. */
+		clear_buffer_dirty(bh);
+		clear_buffer_jbddirty(bh);
+
+		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
+
+		drop_reserve = 1;
+
+		/*
+		 * We are no longer going to journal this buffer.
+		 * However, the commit of this transaction is still
+		 * important to the buffer: the delete that we are now
+		 * processing might obsolete an old log entry, so by
+		 * committing, we can satisfy the buffer's checkpoint.
+		 *
+		 * So, if we have a checkpoint on the buffer, we should
+		 * now refile the buffer on our BJ_Forget list so that
+		 * we know to remove the checkpoint after we commit.
+		 */
+
+		if (jh->b_cp_transaction) {
+			__jbd2_journal_temp_unlink_buffer(jh);
+			__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+		} else {
+			__jbd2_journal_unfile_buffer(jh);
+			jbd2_journal_remove_journal_head(bh);
+			__brelse(bh);
+			if (!buffer_jbd(bh)) {
+				spin_unlock(&journal->j_list_lock);
+				jbd_unlock_bh_state(bh);
+				__bforget(bh);
+				goto drop;
+			}
+		}
+	} else if (jh->b_transaction) {
+		J_ASSERT_JH(jh, (jh->b_transaction ==
+				 journal->j_committing_transaction));
+		/* However, if the buffer is still owned by a prior
+		 * (committing) transaction, we can't drop it yet... */
+		JBUFFER_TRACE(jh, "belongs to older transaction");
+		/* ... but we CAN drop it from the new transaction if we
+		 * have also modified it since the original commit. */
+
+		if (jh->b_next_transaction) {
+			J_ASSERT(jh->b_next_transaction == transaction);
+			jh->b_next_transaction = NULL;
+			drop_reserve = 1;
+		}
+	}
+
+not_jbd:
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	__brelse(bh);
+drop:
+	if (drop_reserve) {
+		/* no need to reserve log space for this block -bzzz */
+		handle->h_buffer_credits++;
+	}
+	return err;
+}
+
+/**
+ * int jbd2_journal_stop() - complete a transaction
+ * @handle: tranaction to complete.
+ *
+ * All done for a particular handle.
+ *
+ * There is not much action needed here.  We just return any remaining
+ * buffer credits to the transaction and remove the handle.  The only
+ * complication is that we need to start a commit operation if the
+ * filesystem is marked for synchronous update.
+ *
+ * jbd2_journal_stop itself will not usually return an error, but it may
+ * do so in unusual circumstances.  In particular, expect it to
+ * return -EIO if a jbd2_journal_abort has been executed since the
+ * transaction began.
+ */
+int jbd2_journal_stop(handle_t *handle)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	int old_handle_count, err;
+	pid_t pid;
+
+	J_ASSERT(journal_current_handle() == handle);
+
+	if (is_handle_aborted(handle))
+		err = -EIO;
+	else {
+		J_ASSERT(transaction->t_updates > 0);
+		err = 0;
+	}
+
+	if (--handle->h_ref > 0) {
+		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+			  handle->h_ref);
+		return err;
+	}
+
+	jbd_debug(4, "Handle %p going down\n", handle);
+
+	/*
+	 * Implement synchronous transaction batching.  If the handle
+	 * was synchronous, don't force a commit immediately.  Let's
+	 * yield and let another thread piggyback onto this transaction.
+	 * Keep doing that while new threads continue to arrive.
+	 * It doesn't cost much - we're about to run a commit and sleep
+	 * on IO anyway.  Speeds up many-threaded, many-dir operations
+	 * by 30x or more...
+	 *
+	 * But don't do this if this process was the most recent one to
+	 * perform a synchronous write.  We do this to detect the case where a
+	 * single process is doing a stream of sync writes.  No point in waiting
+	 * for joiners in that case.
+	 */
+	pid = current->pid;
+	if (handle->h_sync && journal->j_last_sync_writer != pid) {
+		journal->j_last_sync_writer = pid;
+		do {
+			old_handle_count = transaction->t_handle_count;
+			schedule_timeout_uninterruptible(1);
+		} while (old_handle_count != transaction->t_handle_count);
+	}
+
+	current->journal_info = NULL;
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&transaction->t_handle_lock);
+	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+	transaction->t_updates--;
+	if (!transaction->t_updates) {
+		wake_up(&journal->j_wait_updates);
+		if (journal->j_barrier_count)
+			wake_up(&journal->j_wait_transaction_locked);
+	}
+
+	/*
+	 * If the handle is marked SYNC, we need to set another commit
+	 * going!  We also want to force a commit if the current
+	 * transaction is occupying too much of the log, or if the
+	 * transaction is too old now.
+	 */
+	if (handle->h_sync ||
+			transaction->t_outstanding_credits >
+				journal->j_max_transaction_buffers ||
+			time_after_eq(jiffies, transaction->t_expires)) {
+		/* Do this even for aborted journals: an abort still
+		 * completes the commit thread, it just doesn't write
+		 * anything to disk. */
+		tid_t tid = transaction->t_tid;
+
+		spin_unlock(&transaction->t_handle_lock);
+		jbd_debug(2, "transaction too old, requesting commit for "
+					"handle %p\n", handle);
+		/* This is non-blocking */
+		__jbd2_log_start_commit(journal, transaction->t_tid);
+		spin_unlock(&journal->j_state_lock);
+
+		/*
+		 * Special case: JBD2_SYNC synchronous updates require us
+		 * to wait for the commit to complete.
+		 */
+		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
+			err = jbd2_log_wait_commit(journal, tid);
+	} else {
+		spin_unlock(&transaction->t_handle_lock);
+		spin_unlock(&journal->j_state_lock);
+	}
+
+	jbd_free_handle(handle);
+	return err;
+}
+
+/**int jbd2_journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * For synchronous operations: force any uncommitted transactions
+ * to disk.  May seem kludgy, but it reuses all the handle batching
+ * code in a very simple manner.
+ */
+int jbd2_journal_force_commit(journal_t *journal)
+{
+	handle_t *handle;
+	int ret;
+
+	handle = jbd2_journal_start(journal, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+	} else {
+		handle->h_sync = 1;
+		ret = jbd2_journal_stop(handle);
+	}
+	return ret;
+}
+
+/*
+ *
+ * List management code snippets: various functions for manipulating the
+ * transaction buffer lists.
+ *
+ */
+
+/*
+ * Append a buffer to a transaction list, given the transaction's list head
+ * pointer.
+ *
+ * j_list_lock is held.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
+{
+	if (!*list) {
+		jh->b_tnext = jh->b_tprev = jh;
+		*list = jh;
+	} else {
+		/* Insert at the tail of the list to preserve order */
+		struct journal_head *first = *list, *last = first->b_tprev;
+		jh->b_tprev = last;
+		jh->b_tnext = first;
+		last->b_tnext = first->b_tprev = jh;
+	}
+}
+
+/*
+ * Remove a buffer from a transaction list, given the transaction's list
+ * head pointer.
+ *
+ * Called with j_list_lock held, and the journal may not be locked.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
+{
+	if (*list == jh) {
+		*list = jh->b_tnext;
+		if (*list == jh)
+			*list = NULL;
+	}
+	jh->b_tprev->b_tnext = jh->b_tnext;
+	jh->b_tnext->b_tprev = jh->b_tprev;
+}
+
+/*
+ * Remove a buffer from the appropriate transaction list.
+ *
+ * Note that this function can *change* the value of
+ * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
+ * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
+ * is holding onto a copy of one of thee pointers, it could go bad.
+ * Generally the caller needs to re-read the pointer from the transaction_t.
+ *
+ * Called under j_list_lock.  The journal may not be locked.
+ */
+void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
+{
+	struct journal_head **list = NULL;
+	transaction_t *transaction;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	transaction = jh->b_transaction;
+	if (transaction)
+		assert_spin_locked(&transaction->t_journal->j_list_lock);
+
+	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+	if (jh->b_jlist != BJ_None)
+		J_ASSERT_JH(jh, transaction != 0);
+
+	switch (jh->b_jlist) {
+	case BJ_None:
+		return;
+	case BJ_SyncData:
+		list = &transaction->t_sync_datalist;
+		break;
+	case BJ_Metadata:
+		transaction->t_nr_buffers--;
+		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
+		list = &transaction->t_buffers;
+		break;
+	case BJ_Forget:
+		list = &transaction->t_forget;
+		break;
+	case BJ_IO:
+		list = &transaction->t_iobuf_list;
+		break;
+	case BJ_Shadow:
+		list = &transaction->t_shadow_list;
+		break;
+	case BJ_LogCtl:
+		list = &transaction->t_log_list;
+		break;
+	case BJ_Reserved:
+		list = &transaction->t_reserved_list;
+		break;
+	case BJ_Locked:
+		list = &transaction->t_locked_list;
+		break;
+	}
+
+	__blist_del_buffer(list, jh);
+	jh->b_jlist = BJ_None;
+	if (test_clear_buffer_jbddirty(bh))
+		mark_buffer_dirty(bh);	/* Expose it to the VM */
+}
+
+void __jbd2_journal_unfile_buffer(struct journal_head *jh)
+{
+	__jbd2_journal_temp_unlink_buffer(jh);
+	jh->b_transaction = NULL;
+}
+
+void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
+{
+	jbd_lock_bh_state(jh2bh(jh));
+	spin_lock(&journal->j_list_lock);
+	__jbd2_journal_unfile_buffer(jh);
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(jh2bh(jh));
+}
+
+/*
+ * Called from jbd2_journal_try_to_free_buffers().
+ *
+ * Called under jbd_lock_bh_state(bh)
+ */
+static void
+__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
+{
+	struct journal_head *jh;
+
+	jh = bh2jh(bh);
+
+	if (buffer_locked(bh) || buffer_dirty(bh))
+		goto out;
+
+	if (jh->b_next_transaction != 0)
+		goto out;
+
+	spin_lock(&journal->j_list_lock);
+	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
+			/* A written-back ordered data buffer */
+			JBUFFER_TRACE(jh, "release data");
+			__jbd2_journal_unfile_buffer(jh);
+			jbd2_journal_remove_journal_head(bh);
+			__brelse(bh);
+		}
+	} else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+		/* written-back checkpointed metadata buffer */
+		if (jh->b_jlist == BJ_None) {
+			JBUFFER_TRACE(jh, "remove from checkpoint list");
+			__jbd2_journal_remove_checkpoint(jh);
+			jbd2_journal_remove_journal_head(bh);
+			__brelse(bh);
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+out:
+	return;
+}
+
+
+/**
+ * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
+ * @journal: journal for operation
+ * @page: to try and free
+ * @unused_gfp_mask: unused
+ *
+ *
+ * For all the buffers on this page,
+ * if they are fully written out ordered data, move them onto BUF_CLEAN
+ * so try_to_free_buffers() can reap them.
+ *
+ * This function returns non-zero if we wish try_to_free_buffers()
+ * to be called. We do this if the page is releasable by try_to_free_buffers().
+ * We also do it if the page has locked or dirty buffers and the caller wants
+ * us to perform sync or async writeout.
+ *
+ * This complicates JBD locking somewhat.  We aren't protected by the
+ * BKL here.  We wish to remove the buffer from its committing or
+ * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
+ *
+ * This may *change* the value of transaction_t->t_datalist, so anyone
+ * who looks at t_datalist needs to lock against this function.
+ *
+ * Even worse, someone may be doing a jbd2_journal_dirty_data on this
+ * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
+ * will come out of the lock with the buffer dirty, which makes it
+ * ineligible for release here.
+ *
+ * Who else is affected by this?  hmm...  Really the only contender
+ * is do_get_write_access() - it could be looking at the buffer while
+ * journal_try_to_free_buffer() is changing its state.  But that
+ * cannot happen because we never reallocate freed data as metadata
+ * while the data is part of a transaction.  Yes?
+ */
+int jbd2_journal_try_to_free_buffers(journal_t *journal,
+				struct page *page, gfp_t unused_gfp_mask)
+{
+	struct buffer_head *head;
+	struct buffer_head *bh;
+	int ret = 0;
+
+	J_ASSERT(PageLocked(page));
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		struct journal_head *jh;
+
+		/*
+		 * We take our own ref against the journal_head here to avoid
+		 * having to add tons of locking around each instance of
+		 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+		 */
+		jh = jbd2_journal_grab_journal_head(bh);
+		if (!jh)
+			continue;
+
+		jbd_lock_bh_state(bh);
+		__journal_try_to_free_buffer(journal, bh);
+		jbd2_journal_put_journal_head(jh);
+		jbd_unlock_bh_state(bh);
+		if (buffer_jbd(bh))
+			goto busy;
+	} while ((bh = bh->b_this_page) != head);
+	ret = try_to_free_buffers(page);
+busy:
+	return ret;
+}
+
+/*
+ * This buffer is no longer needed.  If it is on an older transaction's
+ * checkpoint list we need to record it on this transaction's forget list
+ * to pin this buffer (and hence its checkpointing transaction) down until
+ * this transaction commits.  If the buffer isn't on a checkpoint list, we
+ * release it.
+ * Returns non-zero if JBD no longer has an interest in the buffer.
+ *
+ * Called under j_list_lock.
+ *
+ * Called under jbd_lock_bh_state(bh).
+ */
+static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
+{
+	int may_free = 1;
+	struct buffer_head *bh = jh2bh(jh);
+
+	__jbd2_journal_unfile_buffer(jh);
+
+	if (jh->b_cp_transaction) {
+		JBUFFER_TRACE(jh, "on running+cp transaction");
+		__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
+		clear_buffer_jbddirty(bh);
+		may_free = 0;
+	} else {
+		JBUFFER_TRACE(jh, "on running transaction");
+		jbd2_journal_remove_journal_head(bh);
+		__brelse(bh);
+	}
+	return may_free;
+}
+
+/*
+ * jbd2_journal_invalidatepage
+ *
+ * This code is tricky.  It has a number of cases to deal with.
+ *
+ * There are two invariants which this code relies on:
+ *
+ * i_size must be updated on disk before we start calling invalidatepage on the
+ * data.
+ *
+ *  This is done in ext3 by defining an ext3_setattr method which
+ *  updates i_size before truncate gets going.  By maintaining this
+ *  invariant, we can be sure that it is safe to throw away any buffers
+ *  attached to the current transaction: once the transaction commits,
+ *  we know that the data will not be needed.
+ *
+ *  Note however that we can *not* throw away data belonging to the
+ *  previous, committing transaction!
+ *
+ * Any disk blocks which *are* part of the previous, committing
+ * transaction (and which therefore cannot be discarded immediately) are
+ * not going to be reused in the new running transaction
+ *
+ *  The bitmap committed_data images guarantee this: any block which is
+ *  allocated in one transaction and removed in the next will be marked
+ *  as in-use in the committed_data bitmap, so cannot be reused until
+ *  the next transaction to delete the block commits.  This means that
+ *  leaving committing buffers dirty is quite safe: the disk blocks
+ *  cannot be reallocated to a different file and so buffer aliasing is
+ *  not possible.
+ *
+ *
+ * The above applies mainly to ordered data mode.  In writeback mode we
+ * don't make guarantees about the order in which data hits disk --- in
+ * particular we don't guarantee that new dirty data is flushed before
+ * transaction commit --- so it is always safe just to discard data
+ * immediately in that mode.  --sct
+ */
+
+/*
+ * The journal_unmap_buffer helper function returns zero if the buffer
+ * concerned remains pinned as an anonymous buffer belonging to an older
+ * transaction.
+ *
+ * We're outside-transaction here.  Either or both of j_running_transaction
+ * and j_committing_transaction may be NULL.
+ */
+static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+{
+	transaction_t *transaction;
+	struct journal_head *jh;
+	int may_free = 1;
+	int ret;
+
+	BUFFER_TRACE(bh, "entry");
+
+	/*
+	 * It is safe to proceed here without the j_list_lock because the
+	 * buffers cannot be stolen by try_to_free_buffers as long as we are
+	 * holding the page lock. --sct
+	 */
+
+	if (!buffer_jbd(bh))
+		goto zap_buffer_unlocked;
+
+	spin_lock(&journal->j_state_lock);
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+
+	jh = jbd2_journal_grab_journal_head(bh);
+	if (!jh)
+		goto zap_buffer_no_jh;
+
+	transaction = jh->b_transaction;
+	if (transaction == NULL) {
+		/* First case: not on any transaction.  If it
+		 * has no checkpoint link, then we can zap it:
+		 * it's a writeback-mode buffer so we don't care
+		 * if it hits disk safely. */
+		if (!jh->b_cp_transaction) {
+			JBUFFER_TRACE(jh, "not on any transaction: zap");
+			goto zap_buffer;
+		}
+
+		if (!buffer_dirty(bh)) {
+			/* bdflush has written it.  We can drop it now */
+			goto zap_buffer;
+		}
+
+		/* OK, it must be in the journal but still not
+		 * written fully to disk: it's metadata or
+		 * journaled data... */
+
+		if (journal->j_running_transaction) {
+			/* ... and once the current transaction has
+			 * committed, the buffer won't be needed any
+			 * longer. */
+			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
+			ret = __dispose_buffer(jh,
+					journal->j_running_transaction);
+			jbd2_journal_put_journal_head(jh);
+			spin_unlock(&journal->j_list_lock);
+			jbd_unlock_bh_state(bh);
+			spin_unlock(&journal->j_state_lock);
+			return ret;
+		} else {
+			/* There is no currently-running transaction. So the
+			 * orphan record which we wrote for this file must have
+			 * passed into commit.  We must attach this buffer to
+			 * the committing transaction, if it exists. */
+			if (journal->j_committing_transaction) {
+				JBUFFER_TRACE(jh, "give to committing trans");
+				ret = __dispose_buffer(jh,
+					journal->j_committing_transaction);
+				jbd2_journal_put_journal_head(jh);
+				spin_unlock(&journal->j_list_lock);
+				jbd_unlock_bh_state(bh);
+				spin_unlock(&journal->j_state_lock);
+				return ret;
+			} else {
+				/* The orphan record's transaction has
+				 * committed.  We can cleanse this buffer */
+				clear_buffer_jbddirty(bh);
+				goto zap_buffer;
+			}
+		}
+	} else if (transaction == journal->j_committing_transaction) {
+		if (jh->b_jlist == BJ_Locked) {
+			/*
+			 * The buffer is on the committing transaction's locked
+			 * list.  We have the buffer locked, so I/O has
+			 * completed.  So we can nail the buffer now.
+			 */
+			may_free = __dispose_buffer(jh, transaction);
+			goto zap_buffer;
+		}
+		/*
+		 * If it is committing, we simply cannot touch it.  We
+		 * can remove it's next_transaction pointer from the
+		 * running transaction if that is set, but nothing
+		 * else. */
+		JBUFFER_TRACE(jh, "on committing transaction");
+		set_buffer_freed(bh);
+		if (jh->b_next_transaction) {
+			J_ASSERT(jh->b_next_transaction ==
+					journal->j_running_transaction);
+			jh->b_next_transaction = NULL;
+		}
+		jbd2_journal_put_journal_head(jh);
+		spin_unlock(&journal->j_list_lock);
+		jbd_unlock_bh_state(bh);
+		spin_unlock(&journal->j_state_lock);
+		return 0;
+	} else {
+		/* Good, the buffer belongs to the running transaction.
+		 * We are writing our own transaction's data, not any
+		 * previous one's, so it is safe to throw it away
+		 * (remember that we expect the filesystem to have set
+		 * i_size already for this truncate so recovery will not
+		 * expose the disk blocks we are discarding here.) */
+		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+		may_free = __dispose_buffer(jh, transaction);
+	}
+
+zap_buffer:
+	jbd2_journal_put_journal_head(jh);
+zap_buffer_no_jh:
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	spin_unlock(&journal->j_state_lock);
+zap_buffer_unlocked:
+	clear_buffer_dirty(bh);
+	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
+	clear_buffer_mapped(bh);
+	clear_buffer_req(bh);
+	clear_buffer_new(bh);
+	bh->b_bdev = NULL;
+	return may_free;
+}
+
+/**
+ * void jbd2_journal_invalidatepage()
+ * @journal: journal to use for flush...
+ * @page:    page to flush
+ * @offset:  length of page to invalidate.
+ *
+ * Reap page buffers containing data after offset in page.
+ *
+ */
+void jbd2_journal_invalidatepage(journal_t *journal,
+		      struct page *page,
+		      unsigned long offset)
+{
+	struct buffer_head *head, *bh, *next;
+	unsigned int curr_off = 0;
+	int may_free = 1;
+
+	if (!PageLocked(page))
+		BUG();
+	if (!page_has_buffers(page))
+		return;
+
+	/* We will potentially be playing with lists other than just the
+	 * data lists (especially for journaled data mode), so be
+	 * cautious in our locking. */
+
+	head = bh = page_buffers(page);
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+		next = bh->b_this_page;
+
+		if (offset <= curr_off) {
+			/* This block is wholly outside the truncation point */
+			lock_buffer(bh);
+			may_free &= journal_unmap_buffer(journal, bh);
+			unlock_buffer(bh);
+		}
+		curr_off = next_off;
+		bh = next;
+
+	} while (bh != head);
+
+	if (!offset) {
+		if (may_free && try_to_free_buffers(page))
+			J_ASSERT(!page_has_buffers(page));
+	}
+}
+
+/*
+ * File a buffer on the given transaction list.
+ */
+void __jbd2_journal_file_buffer(struct journal_head *jh,
+			transaction_t *transaction, int jlist)
+{
+	struct journal_head **list = NULL;
+	int was_dirty = 0;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	assert_spin_locked(&transaction->t_journal->j_list_lock);
+
+	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+				jh->b_transaction == 0);
+
+	if (jh->b_transaction && jh->b_jlist == jlist)
+		return;
+
+	/* The following list of buffer states needs to be consistent
+	 * with __jbd_unexpected_dirty_buffer()'s handling of dirty
+	 * state. */
+
+	if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+		if (test_clear_buffer_dirty(bh) ||
+		    test_clear_buffer_jbddirty(bh))
+			was_dirty = 1;
+	}
+
+	if (jh->b_transaction)
+		__jbd2_journal_temp_unlink_buffer(jh);
+	jh->b_transaction = transaction;
+
+	switch (jlist) {
+	case BJ_None:
+		J_ASSERT_JH(jh, !jh->b_committed_data);
+		J_ASSERT_JH(jh, !jh->b_frozen_data);
+		return;
+	case BJ_SyncData:
+		list = &transaction->t_sync_datalist;
+		break;
+	case BJ_Metadata:
+		transaction->t_nr_buffers++;
+		list = &transaction->t_buffers;
+		break;
+	case BJ_Forget:
+		list = &transaction->t_forget;
+		break;
+	case BJ_IO:
+		list = &transaction->t_iobuf_list;
+		break;
+	case BJ_Shadow:
+		list = &transaction->t_shadow_list;
+		break;
+	case BJ_LogCtl:
+		list = &transaction->t_log_list;
+		break;
+	case BJ_Reserved:
+		list = &transaction->t_reserved_list;
+		break;
+	case BJ_Locked:
+		list =  &transaction->t_locked_list;
+		break;
+	}
+
+	__blist_add_buffer(list, jh);
+	jh->b_jlist = jlist;
+
+	if (was_dirty)
+		set_buffer_jbddirty(bh);
+}
+
+void jbd2_journal_file_buffer(struct journal_head *jh,
+				transaction_t *transaction, int jlist)
+{
+	jbd_lock_bh_state(jh2bh(jh));
+	spin_lock(&transaction->t_journal->j_list_lock);
+	__jbd2_journal_file_buffer(jh, transaction, jlist);
+	spin_unlock(&transaction->t_journal->j_list_lock);
+	jbd_unlock_bh_state(jh2bh(jh));
+}
+
+/*
+ * Remove a buffer from its current buffer list in preparation for
+ * dropping it from its current transaction entirely.  If the buffer has
+ * already started to be used by a subsequent transaction, refile the
+ * buffer on that transaction's metadata list.
+ *
+ * Called under journal->j_list_lock
+ *
+ * Called under jbd_lock_bh_state(jh2bh(jh))
+ */
+void __jbd2_journal_refile_buffer(struct journal_head *jh)
+{
+	int was_dirty;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	if (jh->b_transaction)
+		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
+
+	/* If the buffer is now unused, just drop it. */
+	if (jh->b_next_transaction == NULL) {
+		__jbd2_journal_unfile_buffer(jh);
+		return;
+	}
+
+	/*
+	 * It has been modified by a later transaction: add it to the new
+	 * transaction's metadata list.
+	 */
+
+	was_dirty = test_clear_buffer_jbddirty(bh);
+	__jbd2_journal_temp_unlink_buffer(jh);
+	jh->b_transaction = jh->b_next_transaction;
+	jh->b_next_transaction = NULL;
+	__jbd2_journal_file_buffer(jh, jh->b_transaction,
+				was_dirty ? BJ_Metadata : BJ_Reserved);
+	J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+
+	if (was_dirty)
+		set_buffer_jbddirty(bh);
+}
+
+/*
+ * For the unlocked version of this call, also make sure that any
+ * hanging journal_head is cleaned up if necessary.
+ *
+ * __jbd2_journal_refile_buffer is usually called as part of a single locked
+ * operation on a buffer_head, in which the caller is probably going to
+ * be hooking the journal_head onto other lists.  In that case it is up
+ * to the caller to remove the journal_head if necessary.  For the
+ * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be
+ * doing anything else to the buffer so we need to do the cleanup
+ * ourselves to avoid a jh leak.
+ *
+ * *** The journal_head may be freed by this call! ***
+ */
+void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+
+	__jbd2_journal_refile_buffer(jh);
+	jbd_unlock_bh_state(bh);
+	jbd2_journal_remove_journal_head(bh);
+
+	spin_unlock(&journal->j_list_lock);
+	__brelse(bh);
+}
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index 93068697a9bf..3f7899ea4cba 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -364,12 +364,11 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode,
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_mtime.tv_nsec = 0;
 	inode->i_atime.tv_nsec = 0;
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = (inode->i_size + 511) >> 9;
 
 	f = jffs_find_file(c, raw_inode->ino);
 
-	inode->u.generic_ip = (void *)f;
+	inode->i_private = (void *)f;
 	insert_inode_hash(inode);
 
 	return inode;
@@ -442,7 +441,7 @@ jffs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	});
 
 	result = -ENOTDIR;
-	if (!(old_dir_f = (struct jffs_file *)old_dir->u.generic_ip)) {
+	if (!(old_dir_f = old_dir->i_private)) {
 		D(printk("jffs_rename(): Old dir invalid.\n"));
 		goto jffs_rename_end;
 	}
@@ -456,7 +455,7 @@ jffs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Find the new directory.  */
 	result = -ENOTDIR;
-	if (!(new_dir_f = (struct jffs_file *)new_dir->u.generic_ip)) {
+	if (!(new_dir_f = new_dir->i_private)) {
 		D(printk("jffs_rename(): New dir invalid.\n"));
 		goto jffs_rename_end;
 	}
@@ -593,7 +592,7 @@ jffs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		else {
 			ddino = ((struct jffs_file *)
-				 inode->u.generic_ip)->pino;
+				 inode->i_private)->pino;
 		}
 		D3(printk("jffs_readdir(): \"..\" %u\n", ddino));
 		if (filldir(dirent, "..", 2, filp->f_pos, ddino, DT_DIR) < 0) {
@@ -604,7 +603,7 @@ jffs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		}
 		filp->f_pos++;
 	}
-	f = ((struct jffs_file *)inode->u.generic_ip)->children;
+	f = ((struct jffs_file *)inode->i_private)->children;
 
 	j = 2;
 	while(f && (f->deleted || j++ < filp->f_pos )) {
@@ -652,7 +651,7 @@ jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	lock_kernel();
 
 	D3({
-		char *s = (char *)kmalloc(len + 1, GFP_KERNEL);
+		char *s = kmalloc(len + 1, GFP_KERNEL);
 		memcpy(s, name, len);
 		s[len] = '\0';
 		printk("jffs_lookup(): dir: 0x%p, name: \"%s\"\n", dir, s);
@@ -668,7 +667,7 @@ jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	}
 
 	r = -EACCES;
-	if (!(d = (struct jffs_file *)dir->u.generic_ip)) {
+	if (!(d = (struct jffs_file *)dir->i_private)) {
 		D(printk("jffs_lookup(): No such inode! (%lu)\n",
 			 dir->i_ino));
 		goto jffs_lookup_end;
@@ -739,7 +738,7 @@ jffs_do_readpage_nolock(struct file *file, struct page *page)
 	unsigned long read_len;
 	int result;
 	struct inode *inode = (struct inode*)page->mapping->host;
-	struct jffs_file *f = (struct jffs_file *)inode->u.generic_ip;
+	struct jffs_file *f = (struct jffs_file *)inode->i_private;
 	struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info;
 	int r;
 	loff_t offset;
@@ -828,7 +827,7 @@ jffs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	});
 
 	lock_kernel();
-	dir_f = (struct jffs_file *)dir->u.generic_ip;
+	dir_f = dir->i_private;
 
 	ASSERT(if (!dir_f) {
 		printk(KERN_ERR "jffs_mkdir(): No reference to a "
@@ -972,7 +971,7 @@ jffs_remove(struct inode *dir, struct dentry *dentry, int type)
 		kfree(_name);
 	});
 
-	dir_f = (struct jffs_file *) dir->u.generic_ip;
+	dir_f = dir->i_private;
 	c = dir_f->c;
 
 	result = -ENOENT;
@@ -1053,9 +1052,8 @@ jffs_remove(struct inode *dir, struct dentry *dentry, int type)
 
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
-	inode->i_nlink--;
 	inode->i_ctime = dir->i_ctime;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 
 	d_delete(dentry);	/* This also frees the inode */
 
@@ -1082,7 +1080,7 @@ jffs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 	lock_kernel();
-	dir_f = (struct jffs_file *)dir->u.generic_ip;
+	dir_f = dir->i_private;
 	c = dir_f->c;
 
 	D3(printk (KERN_NOTICE "mknod(): down biglock\n"));
@@ -1173,8 +1171,8 @@ jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	lock_kernel();
 	D1({
 		int len = dentry->d_name.len; 
-		char *_name = (char *)kmalloc(len + 1, GFP_KERNEL);
-		char *_symname = (char *)kmalloc(symname_len + 1, GFP_KERNEL);
+		char *_name = kmalloc(len + 1, GFP_KERNEL);
+		char *_symname = kmalloc(symname_len + 1, GFP_KERNEL);
 		memcpy(_name, dentry->d_name.name, len);
 		_name[len] = '\0';
 		memcpy(_symname, symname, symname_len);
@@ -1186,7 +1184,7 @@ jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 		kfree(_symname);
 	});
 
-	dir_f = (struct jffs_file *)dir->u.generic_ip;
+	dir_f = dir->i_private;
 	ASSERT(if (!dir_f) {
 		printk(KERN_ERR "jffs_symlink(): No reference to a "
 		       "jffs_file struct in inode.\n");
@@ -1282,14 +1280,14 @@ jffs_create(struct inode *dir, struct dentry *dentry, int mode,
 	lock_kernel();
 	D1({
 		int len = dentry->d_name.len;
-		char *s = (char *)kmalloc(len + 1, GFP_KERNEL);
+		char *s = kmalloc(len + 1, GFP_KERNEL);
 		memcpy(s, dentry->d_name.name, len);
 		s[len] = '\0';
 		printk("jffs_create(): dir: 0x%p, name: \"%s\"\n", dir, s);
 		kfree(s);
 	});
 
-	dir_f = (struct jffs_file *)dir->u.generic_ip;
+	dir_f = dir->i_private;
 	ASSERT(if (!dir_f) {
 		printk(KERN_ERR "jffs_create(): No reference to a "
 		       "jffs_file struct in inode.\n");
@@ -1403,9 +1401,9 @@ jffs_file_write(struct file *filp, const char *buf, size_t count,
 		goto out_isem;
 	}
 
-	if (!(f = (struct jffs_file *)inode->u.generic_ip)) {
-		D(printk("jffs_file_write(): inode->u.generic_ip = 0x%p\n",
-				inode->u.generic_ip));
+	if (!(f = inode->i_private)) {
+		D(printk("jffs_file_write(): inode->i_private = 0x%p\n",
+				inode->i_private));
 		goto out_isem;
 	}
 
@@ -1633,8 +1631,10 @@ static const struct file_operations jffs_file_operations =
 {
 	.open		= generic_file_open,
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.ioctl		= jffs_ioctl,
 	.mmap		= generic_file_readonly_mmap,
 	.fsync		= jffs_fsync,
@@ -1693,7 +1693,7 @@ jffs_read_inode(struct inode *inode)
 		mutex_unlock(&c->fmc->biglock);
 		return;
 	}
-	inode->u.generic_ip = (void *)f;
+	inode->i_private = f;
 	inode->i_mode = f->mode;
 	inode->i_nlink = f->nlink;
 	inode->i_uid = f->uid;
@@ -1706,7 +1706,6 @@ jffs_read_inode(struct inode *inode)
 	inode->i_mtime.tv_nsec = 
 	inode->i_ctime.tv_nsec = 0;
 
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = (inode->i_size + 511) >> 9;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &jffs_file_inode_operations;
@@ -1748,7 +1747,7 @@ jffs_delete_inode(struct inode *inode)
 	lock_kernel();
 	inode->i_size = 0;
 	inode->i_blocks = 0;
-	inode->u.generic_ip = NULL;
+	inode->i_private = NULL;
 	clear_inode(inode);
 	if (inode->i_nlink == 0) {
 		c = (struct jffs_control *) inode->i_sb->s_fs_info;
diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c
index 9000f1effedf..4a543e114970 100644
--- a/fs/jffs/intrep.c
+++ b/fs/jffs/intrep.c
@@ -488,13 +488,11 @@ jffs_create_file(struct jffs_control *c,
 {
 	struct jffs_file *f;
 
-	if (!(f = (struct jffs_file *)kmalloc(sizeof(struct jffs_file),
-					      GFP_KERNEL))) {
+	if (!(f = kzalloc(sizeof(*f), GFP_KERNEL))) {
 		D(printk("jffs_create_file(): Failed!\n"));
 		return NULL;
 	}
 	no_jffs_file++;
-	memset(f, 0, sizeof(struct jffs_file));
 	f->ino = raw_inode->ino;
 	f->pino = raw_inode->pino;
 	f->nlink = raw_inode->nlink;
@@ -516,7 +514,7 @@ jffs_create_control(struct super_block *sb)
 
 	D2(printk("jffs_create_control()\n"));
 
-	if (!(c = (struct jffs_control *)kmalloc(s, GFP_KERNEL))) {
+	if (!(c = kmalloc(s, GFP_KERNEL))) {
 		goto fail_control;
 	}
 	DJM(no_jffs_control++);
@@ -524,7 +522,7 @@ jffs_create_control(struct super_block *sb)
 	c->gc_task = NULL;
 	c->hash_len = JFFS_HASH_SIZE;
 	s = sizeof(struct list_head) * c->hash_len;
-	if (!(c->hash = (struct list_head *)kmalloc(s, GFP_KERNEL))) {
+	if (!(c->hash = kmalloc(s, GFP_KERNEL))) {
 		goto fail_hash;
 	}
 	DJM(no_hash++);
@@ -593,8 +591,7 @@ jffs_add_virtual_root(struct jffs_control *c)
 	D2(printk("jffs_add_virtual_root(): "
 		  "Creating a virtual root directory.\n"));
 
-	if (!(root = (struct jffs_file *)kmalloc(sizeof(struct jffs_file),
-						 GFP_KERNEL))) {
+	if (!(root = kmalloc(sizeof(struct jffs_file), GFP_KERNEL))) {
 		return -ENOMEM;
 	}
 	no_jffs_file++;
diff --git a/fs/jffs/jffs_fm.c b/fs/jffs/jffs_fm.c
index 7d8ca1aeace2..29b68d939bd9 100644
--- a/fs/jffs/jffs_fm.c
+++ b/fs/jffs/jffs_fm.c
@@ -94,8 +94,7 @@ jffs_build_begin(struct jffs_control *c, int unit)
 	struct mtd_info *mtd;
 	
 	D3(printk("jffs_build_begin()\n"));
-	fmc = (struct jffs_fmcontrol *)kmalloc(sizeof(struct jffs_fmcontrol),
-					       GFP_KERNEL);
+	fmc = kmalloc(sizeof(*fmc), GFP_KERNEL);
 	if (!fmc) {
 		D(printk("jffs_build_begin(): Allocation of "
 			 "struct jffs_fmcontrol failed!\n"));
@@ -486,8 +485,7 @@ jffs_add_node(struct jffs_node *node)
 
 	D3(printk("jffs_add_node(): ino = %u\n", node->ino));
 
-	ref = (struct jffs_node_ref *)kmalloc(sizeof(struct jffs_node_ref),
-					      GFP_KERNEL);
+	ref = kmalloc(sizeof(*ref), GFP_KERNEL);
 	if (!ref)
 		return -ENOMEM;
 
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index edd8371fc6a5..9def6adf4a5d 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -588,7 +588,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
 	}
 
 	dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
-	dir_i->i_nlink++;
+	inc_nlink(dir_i);
 
 	jffs2_free_raw_dirent(rd);
 
@@ -615,7 +615,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
 	}
 	ret = jffs2_unlink(dir_i, dentry);
 	if (!ret)
-		dir_i->i_nlink--;
+		drop_nlink(dir_i);
 	return ret;
 }
 
@@ -823,7 +823,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 
 	if (victim_f) {
 		/* There was a victim. Kill it off nicely */
-		new_dentry->d_inode->i_nlink--;
+		drop_nlink(new_dentry->d_inode);
 		/* Don't oops if the victim was a dirent pointing to an
 		   inode which didn't exist. */
 		if (victim_f->inocache) {
@@ -836,7 +836,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 	/* If it was a directory we moved, and there was no victim,
 	   increase i_nlink on its new parent */
 	if (S_ISDIR(old_dentry->d_inode->i_mode) && !victim_f)
-		new_dir_i->i_nlink++;
+		inc_nlink(new_dir_i);
 
 	/* Unlink the original */
 	ret = jffs2_do_unlink(c, JFFS2_INODE_INFO(old_dir_i),
@@ -848,7 +848,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 		/* Oh shit. We really ought to make a single node which can do both atomically */
 		struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode);
 		down(&f->sem);
-		old_dentry->d_inode->i_nlink++;
+		inc_nlink(old_dentry->d_inode);
 		if (f->inocache)
 			f->inocache->nlink++;
 		up(&f->sem);
@@ -862,7 +862,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 	}
 
 	if (S_ISDIR(old_dentry->d_inode->i_mode))
-		old_dir_i->i_nlink--;
+		drop_nlink(old_dir_i);
 
 	new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now);
 
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 3ed6e3e120b6..242875f77cb3 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -42,8 +42,10 @@ const struct file_operations jffs2_file_operations =
 {
 	.llseek =	generic_file_llseek,
 	.open =		generic_file_open,
-	.read =		generic_file_read,
-	.write =	generic_file_write,
+ 	.read =		do_sync_read,
+ 	.aio_read =	generic_file_aio_read,
+ 	.write =	do_sync_write,
+ 	.aio_write =	generic_file_aio_write,
 	.ioctl =	jffs2_ioctl,
 	.mmap =		generic_file_readonly_mmap,
 	.fsync =	jffs2_fsync,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 4780f82825d6..7bc1a4201c0c 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -263,7 +263,6 @@ void jffs2_read_inode (struct inode *inode)
 
 	inode->i_nlink = f->inocache->nlink;
 
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = (inode->i_size + 511) >> 9;
 
 	switch (inode->i_mode & S_IFMT) {
@@ -278,13 +277,13 @@ void jffs2_read_inode (struct inode *inode)
 
 		for (fd=f->dents; fd; fd = fd->next) {
 			if (fd->type == DT_DIR && fd->ino)
-				inode->i_nlink++;
+				inc_nlink(inode);
 		}
 		/* and '..' */
-		inode->i_nlink++;
+		inc_nlink(inode);
 		/* Root dir gets i_nlink 3 for some reason */
 		if (inode->i_ino == 1)
-			inode->i_nlink++;
+			inc_nlink(inode);
 
 		inode->i_op = &jffs2_dir_inode_operations;
 		inode->i_fop = &jffs2_dir_operations;
@@ -449,7 +448,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
 	inode->i_atime = inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
 	ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
 
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_size = 0;
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 68e3953419b4..bc4b8106a490 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -119,10 +119,9 @@ static int jffs2_get_sb_mtd(struct file_system_type *fs_type,
 	struct jffs2_sb_info *c;
 	int ret;
 
-	c = kmalloc(sizeof(*c), GFP_KERNEL);
+	c = kzalloc(sizeof(*c), GFP_KERNEL);
 	if (!c)
 		return -ENOMEM;
-	memset(c, 0, sizeof(*c));
 	c->mtd = mtd;
 
 	sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c);
@@ -335,10 +334,10 @@ static int __init init_jffs2_fs(void)
 	   which means just 'no padding', without the alignment
 	   thing. But GCC doesn't have that -- we have to just
 	   hope the structs are the right sizes, instead. */
-	BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
-	BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
-	BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
-	BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
+	BUILD_BUG_ON(sizeof(struct jffs2_unknown_node) != 12);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_dirent) != 40);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_inode) != 68);
+	BUILD_BUG_ON(sizeof(struct jffs2_raw_summary) != 32);
 
 	printk(KERN_INFO "JFFS2 version 2.2."
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index e2281300979c..4d84bdc88299 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -5,16 +5,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -183,7 +183,7 @@ cleanup:
 		posix_acl_release(acl);
 	} else
 		inode->i_mode &= ~current->fs->umask;
-	
+
 	JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
 			       inode->i_mode;
 
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
index ab7cd0567c95..79494c4f2b10 100644
--- a/fs/jfs/endian24.h
+++ b/fs/jfs/endian24.h
@@ -1,5 +1,5 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2001
+ *   Copyright (C) International Business Machines Corp., 2001
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 1c9745be5ada..34181b8f5a0a 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -103,14 +103,12 @@ struct inode_operations jfs_file_inode_operations = {
 const struct file_operations jfs_file_operations = {
 	.open		= jfs_open,
 	.llseek		= generic_file_llseek,
-	.write		= generic_file_write,
-	.read		= generic_file_read,
+	.write		= do_sync_write,
+	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.readv		= generic_file_readv,
-	.writev		= generic_file_writev,
- 	.sendfile	= generic_file_sendfile,
+	.sendfile	= generic_file_sendfile,
 	.fsync		= jfs_fsync,
 	.release	= jfs_release,
 	.ioctl		= jfs_ioctl,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index a223cf4faa9b..f5719117edfe 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -33,7 +33,7 @@
 
 void jfs_read_inode(struct inode *inode)
 {
-	if (diRead(inode)) { 
+	if (diRead(inode)) {
 		make_bad_inode(inode);
 		return;
 	}
@@ -227,7 +227,7 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
 #ifdef _JFS_4K
 	if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad)))
 		goto unlock;
-	rc = extAlloc(ip, xlen, lblock64, &xad, FALSE);
+	rc = extAlloc(ip, xlen, lblock64, &xad, false);
 	if (rc)
 		goto unlock;
 
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 67b3774820eb..37db52488262 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -6,7 +6,6 @@
  */
 
 #include <linux/fs.h>
-#include <linux/ext2_fs.h>
 #include <linux/ctype.h>
 #include <linux/capability.h>
 #include <linux/time.h>
@@ -22,13 +21,13 @@ static struct {
 	long jfs_flag;
 	long ext2_flag;
 } jfs_map[] = {
-	{JFS_NOATIME_FL, EXT2_NOATIME_FL},
-	{JFS_DIRSYNC_FL, EXT2_DIRSYNC_FL},
-	{JFS_SYNC_FL, EXT2_SYNC_FL},
-	{JFS_SECRM_FL, EXT2_SECRM_FL},
-	{JFS_UNRM_FL, EXT2_UNRM_FL},
-	{JFS_APPEND_FL, EXT2_APPEND_FL},
-	{JFS_IMMUTABLE_FL, EXT2_IMMUTABLE_FL},
+	{JFS_NOATIME_FL,	FS_NOATIME_FL},
+	{JFS_DIRSYNC_FL,	FS_DIRSYNC_FL},
+	{JFS_SYNC_FL,		FS_SYNC_FL},
+	{JFS_SECRM_FL,		FS_SECRM_FL},
+	{JFS_UNRM_FL,		FS_UNRM_FL},
+	{JFS_APPEND_FL,		FS_APPEND_FL},
+	{JFS_IMMUTABLE_FL,	FS_IMMUTABLE_FL},
 	{0, 0},
 };
 
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index a76293767c73..455fa4292045 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -1,18 +1,18 @@
 /*
- *   Copyright (c) International Business Machines  Corp., 2002
+ *   Copyright (C) International Business Machines  Corp., 2002
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_ACL
diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h
index 7f3e9ac454ff..79c61805bd33 100644
--- a/fs/jfs/jfs_btree.h
+++ b/fs/jfs/jfs_btree.h
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_BTREE
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index 81f0e514c490..9c5d59632aac 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h
index 9f2572aea561..40b20111383c 100644
--- a/fs/jfs/jfs_dinode.h
+++ b/fs/jfs/jfs_dinode.h
@@ -1,18 +1,18 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2001
+ *   Copyright (C) International Business Machines Corp., 2000-2001
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_DINODE
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index c161c98954e0..23546c8fd48b 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -30,28 +30,28 @@
  *
  *	the working state of the block allocation map is accessed in
  *	two directions:
- *	
+ *
  *	1) allocation and free requests that start at the dmap
  *	   level and move up through the dmap control pages (i.e.
  *	   the vast majority of requests).
- * 
- * 	2) allocation requests that start at dmap control page
+ *
+ *	2) allocation requests that start at dmap control page
  *	   level and work down towards the dmaps.
- *	
- *	the serialization scheme used here is as follows. 
  *
- *	requests which start at the bottom are serialized against each 
- *	other through buffers and each requests holds onto its buffers 
- *	as it works it way up from a single dmap to the required level 
+ *	the serialization scheme used here is as follows.
+ *
+ *	requests which start at the bottom are serialized against each
+ *	other through buffers and each requests holds onto its buffers
+ *	as it works it way up from a single dmap to the required level
  *	of dmap control page.
  *	requests that start at the top are serialized against each other
  *	and request that start from the bottom by the multiple read/single
  *	write inode lock of the bmap inode. requests starting at the top
  *	take this lock in write mode while request starting at the bottom
  *	take the lock in read mode.  a single top-down request may proceed
- *	exclusively while multiple bottoms-up requests may proceed 
- * 	simultaneously (under the protection of busy buffers).
- *	
+ *	exclusively while multiple bottoms-up requests may proceed
+ *	simultaneously (under the protection of busy buffers).
+ *
  *	in addition to information found in dmaps and dmap control pages,
  *	the working state of the block allocation map also includes read/
  *	write information maintained in the bmap descriptor (i.e. total
@@ -59,7 +59,7 @@
  *	a single exclusive lock (BMAP_LOCK) is used to guard this information
  *	in the face of multiple-bottoms up requests.
  *	(lock ordering: IREAD_LOCK, BMAP_LOCK);
- *	
+ *
  *	accesses to the persistent state of the block allocation map (limited
  *	to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
  */
@@ -120,7 +120,7 @@ static int dbGetL2AGSize(s64 nblocks);
 /*
  *	buddy table
  *
- * table used for determining buddy sizes within characters of 
+ * table used for determining buddy sizes within characters of
  * dmap bitmap words.  the characters themselves serve as indexes
  * into the table, with the table elements yielding the maximum
  * binary buddy of free bits within the character.
@@ -146,7 +146,7 @@ static const s8 budtab[256] = {
 
 
 /*
- * NAME:    	dbMount()
+ * NAME:	dbMount()
  *
  * FUNCTION:	initializate the block allocation map.
  *
@@ -223,12 +223,12 @@ int dbMount(struct inode *ipbmap)
 
 
 /*
- * NAME:    	dbUnmount()
+ * NAME:	dbUnmount()
  *
  * FUNCTION:	terminate the block allocation map in preparation for
  *		file system unmount.
  *
- * 		the in-core bmap descriptor is written to disk and
+ *		the in-core bmap descriptor is written to disk and
  *		the memory for this descriptor is freed.
  *
  * PARAMETERS:
@@ -311,7 +311,7 @@ int dbSync(struct inode *ipbmap)
 
 
 /*
- * NAME:    	dbFree()
+ * NAME:	dbFree()
  *
  * FUNCTION:	free the specified block range from the working block
  *		allocation map.
@@ -397,14 +397,14 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
  *
  * FUNCTION:    update the allocation state (free or allocate) of the
  *		specified block range in the persistent block allocation map.
- *		
+ *
  *		the blocks will be updated in the persistent map one
  *		dmap at a time.
  *
  * PARAMETERS:
  *      ipbmap	-  pointer to in-core inode for the block map.
- *      free	- TRUE if block range is to be freed from the persistent
- *		  map; FALSE if it is to   be allocated.
+ *      free	-  'true' if block range is to be freed from the persistent
+ *		   map; 'false' if it is to   be allocated.
  *      blkno	-  starting block number of the range.
  *      nblocks	-  number of contiguous blocks in the range.
  *      tblk	-  transaction block;
@@ -475,7 +475,7 @@ dbUpdatePMap(struct inode *ipbmap,
 		/* update the bits of the dmap words. the first and last
 		 * words may only have a subset of their bits updated. if
 		 * this is the case, we'll work against that word (i.e.
-		 * partial first and/or last) only in a single pass.  a 
+		 * partial first and/or last) only in a single pass.  a
 		 * single pass will also be used to update all words that
 		 * are to have all their bits updated.
 		 */
@@ -662,11 +662,11 @@ unlock:
  *		the block allocation policy uses hints and a multi-step
  *		approach.
  *
- *	  	for allocation requests smaller than the number of blocks
+ *		for allocation requests smaller than the number of blocks
  *		per dmap, we first try to allocate the new blocks
  *		immediately following the hint.  if these blocks are not
  *		available, we try to allocate blocks near the hint.  if
- *		no blocks near the hint are available, we next try to 
+ *		no blocks near the hint are available, we next try to
  *		allocate within the same dmap as contains the hint.
  *
  *		if no blocks are available in the dmap or the allocation
@@ -713,7 +713,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 #endif				/* _STILL_TO_PORT */
 
 	/* get the log2 number of blocks to be allocated.
-	 * if the number of blocks is not a log2 multiple, 
+	 * if the number of blocks is not a log2 multiple,
 	 * it will be rounded up to the next log2 multiple.
 	 */
 	l2nb = BLKSTOL2(nblocks);
@@ -906,7 +906,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
 	 * validate extent request:
 	 *
 	 * note: defragfs policy:
-	 *  max 64 blocks will be moved.  
+	 *  max 64 blocks will be moved.
 	 *  allocation request size must be satisfied from a single dmap.
 	 */
 	if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
@@ -1333,7 +1333,7 @@ dbAllocNear(struct bmap * bmp,
  *		or two sub-trees, depending on the allocation group size.
  *		we search the top nodes of these subtrees left to right for
  *		sufficient free space.  if sufficient free space is found,
- *		the subtree is searched to find the leftmost leaf that 
+ *		the subtree is searched to find the leftmost leaf that
  *		has free space.  once we have made it to the leaf, we
  *		move the search to the next lower level dmap control page
  *		corresponding to this leaf.  we continue down the dmap control
@@ -1398,7 +1398,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
 	 * that fully describes the allocation group since the allocation
 	 * group is already fully described by a dmap.  in this case, we
 	 * just call dbAllocCtl() to search the dmap tree and allocate the
-	 * required space if available.  
+	 * required space if available.
 	 *
 	 * if the allocation group is completely free, dbAllocCtl() is
 	 * also called to allocate the required space.  this is done for
@@ -1450,7 +1450,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
 	    (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
 	ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
 
-	/* dmap control page trees fan-out by 4 and a single allocation 
+	/* dmap control page trees fan-out by 4 and a single allocation
 	 * group may be described by 1 or 2 subtrees within the ag level
 	 * dmap control page, depending upon the ag size. examine the ag's
 	 * subtrees for sufficient free space, starting with the leftmost
@@ -1633,7 +1633,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
 
 	/* starting at the specified dmap control page level and block
 	 * number, search down the dmap control levels for the starting
-	 * block number of a dmap page that contains or starts off 
+	 * block number of a dmap page that contains or starts off
 	 * sufficient free blocks.
 	 */
 	for (lev = level, b = *blkno; lev >= 0; lev--) {
@@ -1677,7 +1677,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
 		}
 
 		/* adjust the block number to reflect the location within
-		 * the dmap control page (i.e. the leaf) at which free 
+		 * the dmap control page (i.e. the leaf) at which free
 		 * space was found.
 		 */
 		b += (((s64) leafidx) << budmin);
@@ -1700,12 +1700,12 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
  * NAME:	dbAllocCtl()
  *
  * FUNCTION:    attempt to allocate a specified number of contiguous
- *		blocks starting within a specific dmap.  
- *		
+ *		blocks starting within a specific dmap.
+ *
  *		this routine is called by higher level routines that search
  *		the dmap control pages above the actual dmaps for contiguous
  *		free space.  the result of successful searches by these
- * 		routines are the starting block numbers within dmaps, with
+ *		routines are the starting block numbers within dmaps, with
  *		the dmaps themselves containing the desired contiguous free
  *		space or starting a contiguous free space of desired size
  *		that is made up of the blocks of one or more dmaps. these
@@ -1872,14 +1872,14 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
  *
  * FUNCTION:    attempt to allocate a specified number of contiguous blocks
  *		from a specified dmap.
- *		
+ *
  *		this routine checks if the contiguous blocks are available.
  *		if so, nblocks of blocks are allocated; otherwise, ENOSPC is
  *		returned.
  *
  * PARAMETERS:
  *      mp	-  pointer to bmap descriptor
- *      dp	-  pointer to dmap to attempt to allocate blocks from. 
+ *      dp	-  pointer to dmap to attempt to allocate blocks from.
  *      l2nb	-  log2 number of contiguous block desired.
  *      nblocks	-  actual number of contiguous block desired.
  *      results	-  on successful return, set to the starting block number
@@ -1890,7 +1890,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
  *      -ENOSPC	- insufficient disk resources
  *      -EIO	- i/o error
  *
- * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or 
+ * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
  *	IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
  */
 static int
@@ -2032,7 +2032,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
 
 	/* root changed. bubble the change up to the dmap control pages.
 	 * if the adjustment of the upper level control pages fails,
-	 * backout the deallocation. 
+	 * backout the deallocation.
 	 */
 	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
 		word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
@@ -2245,7 +2245,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 	 * words (i.e. partial first and/or last) on an individual basis
 	 * (a single pass), freeing the bits of interest by hand and updating
 	 * the leaf corresponding to the dmap word. a single pass will be used
-	 * for all dmap words fully contained within the specified range.  
+	 * for all dmap words fully contained within the specified range.
 	 * within this pass, the bits of all fully contained dmap words will
 	 * be marked as free in a single shot and the leaves will be updated. a
 	 * single leaf may describe the free space of multiple dmap words,
@@ -2267,7 +2267,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 		 */
 		if (nb < DBWORD) {
 			/* free (zero) the appropriate bits within this
-			 * dmap word. 
+			 * dmap word.
 			 */
 			dp->wmap[word] &=
 			    cpu_to_le32(~(ONES << (DBWORD - nb)
@@ -2327,7 +2327,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 
 	BMAP_LOCK(bmp);
 
-	/* update the free count for the allocation group and 
+	/* update the free count for the allocation group and
 	 * map.
 	 */
 	agno = blkno >> bmp->db_agl2size;
@@ -2378,7 +2378,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
  *		or deallocation resulted in the root change.  this range
  *		is respresented by a single leaf of the current dmapctl
  *		and the leaf will be updated with this value, possibly
- *		causing a binary buddy system within the leaves to be 
+ *		causing a binary buddy system within the leaves to be
  *		split or joined.  the update may also cause the dmapctl's
  *		dmtree to be updated.
  *
@@ -2394,7 +2394,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
  *		   requires the dmap control page to be adjusted.
  *      newval	-  the new value of the lower level dmap or dmap control
  *		   page root.
- *      alloc	-  TRUE if adjustment is due to an allocation.
+ *      alloc	-  'true' if adjustment is due to an allocation.
  *      level	-  current level of dmap control page (i.e. L0, L1, L2) to
  *		   be adjusted.
  *
@@ -2590,7 +2590,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
 		}
 	}
 
-	/* adjust the dmap tree to reflect the specified leaf's new 
+	/* adjust the dmap tree to reflect the specified leaf's new
 	 * value.
 	 */
 	dbAdjTree(tp, leafno, newval);
@@ -2638,7 +2638,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
 	/* the back split is accomplished by iteratively finding the leaf
 	 * that starts the buddy system that contains the specified leaf and
 	 * splitting that system in two.  this iteration continues until
-	 * the specified leaf becomes the start of a buddy system. 
+	 * the specified leaf becomes the start of a buddy system.
 	 *
 	 * determine maximum possible l2 size for the specified leaf.
 	 */
@@ -2853,7 +2853,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
  * NAME:	dbFindLeaf()
  *
  * FUNCTION:    search a dmtree_t for sufficient free blocks, returning
- *		the index of a leaf describing the free blocks if 
+ *		the index of a leaf describing the free blocks if
  *		sufficient free blocks are found.
  *
  *		the search starts at the top of the dmtree_t tree and
@@ -2869,7 +2869,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
  *
  * RETURN VALUES:
  *      0	- success
- *      -ENOSPC	- insufficient free blocks. 
+ *      -ENOSPC	- insufficient free blocks.
  */
 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
 {
@@ -3090,7 +3090,7 @@ static int blkstol2(s64 nb)
 
 
 /*
- * NAME:    	dbAllocBottomUp()
+ * NAME:	dbAllocBottomUp()
  *
  * FUNCTION:	alloc the specified block range from the working block
  *		allocation map.
@@ -3241,7 +3241,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
 	BMAP_LOCK(bmp);
 
 	/* if this allocation group is completely free,
-	 * update the highest active allocation group number 
+	 * update the highest active allocation group number
 	 * if this allocation group is the new max.
 	 */
 	agno = blkno >> bmp->db_agl2size;
@@ -3273,7 +3273,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
  * NAME:	dbExtendFS()
  *
  * FUNCTION:	extend bmap from blkno for nblocks;
- * 		dbExtendFS() updates bmap ready for dbAllocBottomUp();
+ *		dbExtendFS() updates bmap ready for dbAllocBottomUp();
  *
  * L2
  *  |
@@ -3284,13 +3284,13 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
  *       d0,...,dn  d0,...,dn  d0,...,dn    d0,...,dn  d0,...,dn  d0,.,dm;
  * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
  *
- * <---old---><----------------------------extend----------------------->   
+ * <---old---><----------------------------extend----------------------->
  */
 int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
 	int nbperpage = sbi->nbperpage;
-	int i, i0 = TRUE, j, j0 = TRUE, k, n;
+	int i, i0 = true, j, j0 = true, k, n;
 	s64 newsize;
 	s64 p;
 	struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL;
@@ -3330,7 +3330,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
 	bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
 
 	/*
-	 *      reconfigure db_agfree[] 
+	 *      reconfigure db_agfree[]
 	 * from old AG configuration to new AG configuration;
 	 *
 	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
@@ -3398,7 +3398,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
 			j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
 			l1leaf = l1dcp->stree + CTLLEAFIND + j;
 			p = BLKTOL0(blkno, sbi->l2nbperpage);
-			j0 = FALSE;
+			j0 = false;
 		} else {
 			/* assign/init L1 page */
 			l1mp = get_metapage(ipbmap, p, PSIZE, 0);
@@ -3432,7 +3432,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
 				l0leaf = l0dcp->stree + CTLLEAFIND + i;
 				p = BLKTODMAP(blkno,
 					      sbi->l2nbperpage);
-				i0 = FALSE;
+				i0 = false;
 			} else {
 				/* assign/init L0 page */
 				l0mp = get_metapage(ipbmap, p, PSIZE, 0);
@@ -3491,7 +3491,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
 			}	/* for each dmap in a L0 */
 
 			/*
-			 * build current L0 page from its leaves, and 
+			 * build current L0 page from its leaves, and
 			 * initialize corresponding parent L1 leaf
 			 */
 			*l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
@@ -3515,7 +3515,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
 		}		/* for each L0 in a L1 */
 
 		/*
-		 * build current L1 page from its leaves, and 
+		 * build current L1 page from its leaves, and
 		 * initialize corresponding parent L2 leaf
 		 */
 		*l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
@@ -3570,7 +3570,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
 	 *      finalize bmap control page
 	 */
 //finalize:
-	/* 
+	/*
 	 * compute db_agpref: preferred ag to allocate from
 	 * (the leftmost ag with average free space in it);
 	 */
@@ -3614,9 +3614,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
 
 	/*
 	 * compute db_aglevel, db_agheigth, db_width, db_agstart:
-	 * an ag is covered in aglevel dmapctl summary tree, 
-	 * at agheight level height (from leaf) with agwidth number of nodes 
-	 * each, which starts at agstart index node of the smmary tree node 
+	 * an ag is covered in aglevel dmapctl summary tree,
+	 * at agheight level height (from leaf) with agwidth number of nodes
+	 * each, which starts at agstart index node of the smmary tree node
 	 * array;
 	 */
 	bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
@@ -3635,13 +3635,13 @@ void dbFinalizeBmap(struct inode *ipbmap)
 
 /*
  * NAME:	dbInitDmap()/ujfs_idmap_page()
- *                                                                    
+ *
  * FUNCTION:	initialize working/persistent bitmap of the dmap page
  *		for the specified number of blocks:
- *                                                                    
+ *
  *		at entry, the bitmaps had been initialized as free (ZEROS);
- *		The number of blocks will only account for the actually 
- *		existing blocks. Blocks which don't actually exist in 
+ *		The number of blocks will only account for the actually
+ *		existing blocks. Blocks which don't actually exist in
  *		the aggregate will be marked as allocated (ONES);
  *
  * PARAMETERS:
@@ -3677,7 +3677,7 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
 
 	/*
 	 * free the bits corresponding to the block range (ZEROS):
-	 * note: not all bits of the first and last words may be contained 
+	 * note: not all bits of the first and last words may be contained
 	 * within the block range.
 	 */
 	for (r = nblocks; r > 0; r -= nb, blkno += nb) {
@@ -3709,7 +3709,7 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
 	}
 
 	/*
-	 * mark bits following the range to be freed (non-existing 
+	 * mark bits following the range to be freed (non-existing
 	 * blocks) as allocated (ONES)
 	 */
 
@@ -3741,11 +3741,11 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
 
 /*
  * NAME:	dbInitDmapTree()/ujfs_complete_dmap()
- *                                                                    
+ *
  * FUNCTION:	initialize summary tree of the specified dmap:
  *
  *		at entry, bitmap of the dmap has been initialized;
- *                                                                    
+ *
  * PARAMETERS:
  *	dp	- dmap to complete
  *	blkno	- starting block number for this dmap
@@ -3769,7 +3769,7 @@ static int dbInitDmapTree(struct dmap * dp)
 
 	/* init each leaf from corresponding wmap word:
 	 * note: leaf is set to NOFREE(-1) if all blocks of corresponding
-	 * bitmap word are allocated. 
+	 * bitmap word are allocated.
 	 */
 	cp = tp->stree + le32_to_cpu(tp->leafidx);
 	for (i = 0; i < LPERDMAP; i++)
@@ -3782,10 +3782,10 @@ static int dbInitDmapTree(struct dmap * dp)
 
 /*
  * NAME:	dbInitTree()/ujfs_adjtree()
- *                                                                    
+ *
  * FUNCTION:	initialize binary buddy summary tree of a dmap or dmapctl.
  *
- *		at entry, the leaves of the tree has been initialized 
+ *		at entry, the leaves of the tree has been initialized
  *		from corresponding bitmap word or root of summary tree
  *		of the child control page;
  *		configure binary buddy system at the leaf level, then
@@ -3813,15 +3813,15 @@ static int dbInitTree(struct dmaptree * dtp)
 	/*
 	 * configure the leaf levevl into binary buddy system
 	 *
-	 * Try to combine buddies starting with a buddy size of 1 
-	 * (i.e. two leaves). At a buddy size of 1 two buddy leaves 
-	 * can be combined if both buddies have a maximum free of l2min; 
-	 * the combination will result in the left-most buddy leaf having 
-	 * a maximum free of l2min+1.  
-	 * After processing all buddies for a given size, process buddies 
-	 * at the next higher buddy size (i.e. current size * 2) and 
-	 * the next maximum free (current free + 1).  
-	 * This continues until the maximum possible buddy combination 
+	 * Try to combine buddies starting with a buddy size of 1
+	 * (i.e. two leaves). At a buddy size of 1 two buddy leaves
+	 * can be combined if both buddies have a maximum free of l2min;
+	 * the combination will result in the left-most buddy leaf having
+	 * a maximum free of l2min+1.
+	 * After processing all buddies for a given size, process buddies
+	 * at the next higher buddy size (i.e. current size * 2) and
+	 * the next maximum free (current free + 1).
+	 * This continues until the maximum possible buddy combination
 	 * yields maximum free.
 	 */
 	for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
@@ -3845,10 +3845,10 @@ static int dbInitTree(struct dmaptree * dtp)
 	 * bubble summary information of leaves up the tree.
 	 *
 	 * Starting at the leaf node level, the four nodes described by
-	 * the higher level parent node are compared for a maximum free and 
-	 * this maximum becomes the value of the parent node.  
-	 * when all lower level nodes are processed in this fashion then 
-	 * move up to the next level (parent becomes a lower level node) and 
+	 * the higher level parent node are compared for a maximum free and
+	 * this maximum becomes the value of the parent node.
+	 * when all lower level nodes are processed in this fashion then
+	 * move up to the next level (parent becomes a lower level node) and
 	 * continue the process for that level.
 	 */
 	for (child = le32_to_cpu(dtp->leafidx),
@@ -3857,7 +3857,7 @@ static int dbInitTree(struct dmaptree * dtp)
 		/* get index of 1st node of parent level */
 		parent = (child - 1) >> 2;
 
-		/* set the value of the parent node as the maximum 
+		/* set the value of the parent node as the maximum
 		 * of the four nodes of the current level.
 		 */
 		for (i = 0, cp = tp + child, cp1 = tp + parent;
@@ -3885,8 +3885,8 @@ static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
 	dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
 
 	/*
-	 * initialize the leaves of current level that were not covered 
-	 * by the specified input block range (i.e. the leaves have no 
+	 * initialize the leaves of current level that were not covered
+	 * by the specified input block range (i.e. the leaves have no
 	 * low level dmapctl or dmap).
 	 */
 	cp = &dcp->stree[CTLLEAFIND + i];
@@ -3900,9 +3900,9 @@ static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
 
 /*
  * NAME:	dbGetL2AGSize()/ujfs_getagl2size()
- *                                                                    
+ *
  * FUNCTION:	Determine log2(allocation group size) from aggregate size
- *                                                                    
+ *
  * PARAMETERS:
  *	nblocks	- Number of blocks in aggregate
  *
@@ -3935,8 +3935,8 @@ static int dbGetL2AGSize(s64 nblocks)
 
 /*
  * NAME:	dbMapFileSizeToMapSize()
- *                                                                    
- * FUNCTION:	compute number of blocks the block allocation map file 
+ *
+ * FUNCTION:	compute number of blocks the block allocation map file
  *		can cover from the map file size;
  *
  * RETURNS:	Number of blocks which can be covered by this block map file;
@@ -3968,7 +3968,7 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
 	npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
 	level = BMAPPGTOLEV(npages);
 
-	/* At each level, accumulate the number of dmap pages covered by 
+	/* At each level, accumulate the number of dmap pages covered by
 	 * the number of full child levels below it;
 	 * repeat for the last incomplete child level.
 	 */
@@ -3990,7 +3990,7 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
 		npages--;
 	}
 
-	/* convert the number of dmaps into the number of blocks 
+	/* convert the number of dmaps into the number of blocks
 	 * which can be covered by the dmaps;
 	 */
 	nblocks = ndmaps << L2BPERDMAP;
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 8b14cc8e0228..45ea454c74bd 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -1,18 +1,18 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Copyright (C) International Business Machines Corp., 2000-2002
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_DMAP
@@ -27,7 +27,7 @@
 #define L2LPERDMAP	8	/* l2 number of leaves per dmap tree */
 #define	DBWORD		32	/* # of blks covered by a map word */
 #define	L2DBWORD	5	/* l2 # of blks covered by a mword */
-#define BUDMIN  	L2DBWORD	/* max free string in a map word */
+#define BUDMIN		L2DBWORD	/* max free string in a map word */
 #define BPERDMAP	(LPERDMAP * DBWORD)	/* num of blks per dmap */
 #define L2BPERDMAP	13	/* l2 num of blks per dmap */
 #define CTLTREESIZE	(1024+256+64+16+4+1)	/* size of a dmapctl tree */
@@ -57,7 +57,7 @@
 
 #define	MAXMAPSIZE	MAXL2SIZE	/* maximum aggregate map size */
 
-/* 
+/*
  * determine the maximum free string for four (lower level) nodes
  * of the tree.
  */
@@ -122,7 +122,7 @@ static __inline signed char TREEMAX(signed char *cp)
 #define BLKTOCTL(b,s,l)   \
         (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s)))
 
-/* 
+/*
  * convert aggregate map size to the zero origin dmapctl level of the
  * top dmapctl.
  */
@@ -192,13 +192,13 @@ typedef union dmtree {
 
 /* macros for accessing fields within dmtree */
 #define	dmt_nleafs	t1.nleafs
-#define	dmt_l2nleafs 	t1.l2nleafs
-#define	dmt_leafidx 	t1.leafidx
-#define	dmt_height 	t1.height
-#define	dmt_budmin 	t1.budmin
-#define	dmt_stree 	t1.stree
+#define	dmt_l2nleafs	t1.l2nleafs
+#define	dmt_leafidx	t1.leafidx
+#define	dmt_height	t1.height
+#define	dmt_budmin	t1.budmin
+#define	dmt_stree	t1.stree
 
-/* 
+/*
  *	on-disk aggregate disk allocation map descriptor.
  */
 struct dbmap_disk {
@@ -237,7 +237,7 @@ struct dbmap {
 	s64 dn_agsize;		/* num of blks per alloc group       */
 	signed char dn_maxfreebud;	/* max free buddy system             */
 };				/* - 4096 -                             */
-/* 
+/*
  *	in-memory aggregate disk allocation map descriptor.
  */
 struct bmap {
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 6c3f08319846..ecb2216d881c 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -78,7 +78,7 @@
  *
  * case-insensitive search:
  *
- * 	fold search key;
+ *	fold search key;
  *
  *	case-insensitive search of B-tree:
  *	for internal entry, router key is already folded;
@@ -93,7 +93,7 @@
  *	else
  *		return no match;
  *
- * 	serialization:
+ *	serialization:
  * target directory inode lock is being held on entry/exit
  * of all main directory service routines.
  *
@@ -925,7 +925,7 @@ int dtInsert(tid_t tid, struct inode *ip,
  *
  * return: 0 - success;
  *	   errno - failure;
- * 	leaf page unpinned;
+ *	leaf page unpinned;
  */
 static int dtSplitUp(tid_t tid,
 	  struct inode *ip, struct dtsplit * split, struct btstack * btstack)
@@ -3767,7 +3767,7 @@ static int ciCompare(struct component_name * key,	/* search key */
  *	     across page boundary
  *
  * return: non-zero on error
- *	
+ *
  */
 static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
 			       int ri, struct component_name * key, int flag)
@@ -3780,13 +3780,13 @@ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp,
 	lkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
 					GFP_KERNEL);
 	if (lkey.name == NULL)
-		return -ENOSPC;
+		return -ENOMEM;
 
 	rkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t),
 					GFP_KERNEL);
 	if (rkey.name == NULL) {
 		kfree(lkey.name);
-		return -ENOSPC;
+		return -ENOMEM;
 	}
 
 	/* get left and right key */
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index 13e4fdf07724..af8513f78648 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -1,18 +1,18 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Copyright (C) International Business Machines Corp., 2000-2002
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_DTREE
@@ -80,7 +80,7 @@ struct idtentry {
 /*
  *	leaf node entry head/only segment
  *
- * 	For legacy filesystems, name contains 13 wchars -- no index field
+ *	For legacy filesystems, name contains 13 wchars -- no index field
  */
 struct ldtentry {
 	__le32 inumber;		/* 4: 4-byte aligned */
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 4d52593a5fc6..a35bdca6a805 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -74,7 +74,7 @@ static s64 extRoundDown(s64 nb);
  *		  extent that is used as an allocation hint if the
  *		  xaddr of the xad is non-zero.  on successful exit,
  *		  the xad describes the newly allocated extent.
- *	abnr	- boolean_t indicating whether the newly allocated extent
+ *	abnr	- bool indicating whether the newly allocated extent
  *		  should be marked as allocated but not recorded.
  *
  * RETURN VALUES:
@@ -83,7 +83,7 @@ static s64 extRoundDown(s64 nb);
  *      -ENOSPC	- insufficient disk resources.
  */
 int
-extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
+extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
 	s64 nxlen, nxaddr, xoff, hint, xaddr = 0;
@@ -117,7 +117,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
 		 * following the hint extent.
 		 */
 		if (offsetXAD(xp) + nxlen == xoff &&
-		    abnr == ((xp->flag & XAD_NOTRECORDED) ? TRUE : FALSE))
+		    abnr == ((xp->flag & XAD_NOTRECORDED) ? true : false))
 			xaddr = hint + nxlen;
 
 		/* adjust the hint to the last block of the extent */
@@ -125,7 +125,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
 	}
 
 	/* allocate the disk blocks for the extent.  initially, extBalloc()
-	 * will try to allocate disk blocks for the requested size (xlen). 
+	 * will try to allocate disk blocks for the requested size (xlen).
 	 * if this fails (xlen contiguous free blocks not avaliable), it'll
 	 * try to allocate a smaller number of blocks (producing a smaller
 	 * extent), with this smaller number of blocks consisting of the
@@ -148,9 +148,9 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
 	}
 
 	/* determine the value of the extent flag */
-	xflag = (abnr == TRUE) ? XAD_NOTRECORDED : 0;
+	xflag = abnr ? XAD_NOTRECORDED : 0;
 
-	/* if we can extend the hint extent to cover the current request, 
+	/* if we can extend the hint extent to cover the current request,
 	 * extend it.  otherwise, insert a new extent to
 	 * cover the current request.
 	 */
@@ -159,7 +159,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
 	else
 		rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0);
 
-	/* if the extend or insert failed, 
+	/* if the extend or insert failed,
 	 * free the newly allocated blocks and return the error.
 	 */
 	if (rc) {
@@ -203,7 +203,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
  *	xlen	- request size of the resulting extent.
  *	xp	- pointer to an xad. on successful exit, the xad
  *		  describes the newly allocated extent.
- *	abnr	- boolean_t indicating whether the newly allocated extent
+ *	abnr	- bool indicating whether the newly allocated extent
  *		  should be marked as allocated but not recorded.
  *
  * RETURN VALUES:
@@ -211,7 +211,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
  *      -EIO	- i/o error.
  *      -ENOSPC	- insufficient disk resources.
  */
-int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
+int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 {
 	struct super_block *sb = ip->i_sb;
 	s64 xaddr, xlen, nxaddr, delta, xoff;
@@ -235,7 +235,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr)
 	xoff = offsetXAD(xp);
 
 	/* if the extend page is abnr and if the request is for
-	 * the extent to be allocated and recorded, 
+	 * the extent to be allocated and recorded,
 	 * make the page allocated and recorded.
 	 */
 	if ((xp->flag & XAD_NOTRECORDED) && !abnr) {
@@ -397,7 +397,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
 	if ((rc = xtLookupList(ip, &lxdl, &xadl, 0)))
 		return (rc);
 
-	/* check if not extent exists for the previous page.  
+	/* check if not extent exists for the previous page.
 	 * this is possible for sparse files.
 	 */
 	if (xadl.nxad == 0) {
@@ -410,7 +410,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp)
 	 */
 	xp->flag &= XAD_NOTRECORDED;
 
-        if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) {          
+        if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) {
 		jfs_error(ip->i_sb, "extHint: corrupt xtree");
 		return -EIO;
         }
@@ -468,7 +468,7 @@ int extRecord(struct inode *ip, xad_t * xp)
 int extFill(struct inode *ip, xad_t * xp)
 {
 	int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage;
-	s64 blkno = offsetXAD(xp) >> ip->i_blksize;
+	s64 blkno = offsetXAD(xp) >> ip->i_blkbits;
 
 //      assert(ISSPARSE(ip));
 
@@ -476,7 +476,7 @@ int extFill(struct inode *ip, xad_t * xp)
 	XADaddress(xp, 0);
 
 	/* allocate an extent to fill the hole */
-	if ((rc = extAlloc(ip, nbperpage, blkno, xp, FALSE)))
+	if ((rc = extAlloc(ip, nbperpage, blkno, xp, false)))
 		return (rc);
 
 	assert(lengthPXD(xp) == nbperpage);
@@ -492,7 +492,7 @@ int extFill(struct inode *ip, xad_t * xp)
  * FUNCTION:    allocate disk blocks to form an extent.
  *
  *		initially, we will try to allocate disk blocks for the
- *		requested size (nblocks).  if this fails (nblocks 
+ *		requested size (nblocks).  if this fails (nblocks
  *		contiguous free blocks not avaliable), we'll try to allocate
  *		a smaller number of blocks (producing a smaller extent), with
  *		this smaller number of blocks consisting of the requested
@@ -500,7 +500,7 @@ int extFill(struct inode *ip, xad_t * xp)
  *		number (i.e. 16 -> 8).  we'll continue to round down and
  *		retry the allocation until the number of blocks to allocate
  *		is smaller than the number of blocks per page.
- *		
+ *
  * PARAMETERS:
  *	ip	 - the inode of the file.
  *	hint	 - disk block number to be used as an allocation hint.
@@ -509,7 +509,7 @@ int extFill(struct inode *ip, xad_t * xp)
  *		   exit, this value is set to the number of blocks actually
  *		   allocated.
  *	blkno	 - pointer to a block address that is filled in on successful
- *		   return with the starting block number of the newly 
+ *		   return with the starting block number of the newly
  *		   allocated block range.
  *
  * RETURN VALUES:
@@ -530,7 +530,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
 	/* get the number of blocks to initially attempt to allocate.
 	 * we'll first try the number of blocks requested unless this
 	 * number is greater than the maximum number of contiguous free
-	 * blocks in the map. in that case, we'll start off with the 
+	 * blocks in the map. in that case, we'll start off with the
 	 * maximum free.
 	 */
 	max = (s64) 1 << bmp->db_maxfreebud;
@@ -582,19 +582,19 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
  *
  * FUNCTION:    attempt to extend an extent's allocation.
  *
- *		initially, we will try to extend the extent's allocation
- *		in place.  if this fails, we'll try to move the extent
- *		to a new set of blocks. if moving the extent, we initially
+ *		Initially, we will try to extend the extent's allocation
+ *		in place.  If this fails, we'll try to move the extent
+ *		to a new set of blocks.  If moving the extent, we initially
  *		will try to allocate disk blocks for the requested size
- *		(nnew).  if this fails 	(new contiguous free blocks not
- *		avaliable), we'll try  to allocate a smaller number of
+ *		(newnblks).  if this fails (new contiguous free blocks not
+ *		avaliable), we'll try to allocate a smaller number of
  *		blocks (producing a smaller extent), with this smaller
  *		number of blocks consisting of the requested number of
  *		blocks rounded down to the next smaller power of 2
- *		number (i.e. 16 -> 8).  we'll continue to round down and
+ *		number (i.e. 16 -> 8).  We'll continue to round down and
  *		retry the allocation until the number of blocks to allocate
  *		is smaller than the number of blocks per page.
- *		
+ *
  * PARAMETERS:
  *	ip	 - the inode of the file.
  *	blkno    - starting block number of the extents current allocation.
@@ -625,7 +625,7 @@ extBrealloc(struct inode *ip,
 			return (rc);
 	}
 
-	/* in place extension not possible.  
+	/* in place extension not possible.
 	 * try to move the extent to a new set of blocks.
 	 */
 	return (extBalloc(ip, blkno, newnblks, newblkno));
diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h
index e80fc7ced87d..b567e12c52d3 100644
--- a/fs/jfs/jfs_extent.h
+++ b/fs/jfs/jfs_extent.h
@@ -1,18 +1,18 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2001
+ *   Copyright (C) International Business Machines Corp., 2000-2001
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_EXTENT
@@ -22,10 +22,10 @@
 #define	INOHINT(ip)	\
 	(addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1)
 
-extern int	extAlloc(struct inode *, s64, s64, xad_t *, boolean_t);
+extern int	extAlloc(struct inode *, s64, s64, xad_t *, bool);
 extern int	extFill(struct inode *, xad_t *);
 extern int	extHint(struct inode *, s64, xad_t *);
-extern int	extRealloc(struct inode *, s64, xad_t *, boolean_t);
+extern int	extRealloc(struct inode *, s64, xad_t *, bool);
 extern int	extRecord(struct inode *, xad_t *);
 
 #endif	/* _H_JFS_EXTENT */
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 72a5588faeca..9901928668cf 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_FILSYS
@@ -21,9 +21,9 @@
 /*
  *	jfs_filsys.h
  *
- * file system (implementation-dependent) constants 
+ * file system (implementation-dependent) constants
  *
- * refer to <limits.h> for system wide implementation-dependent constants 
+ * refer to <limits.h> for system wide implementation-dependent constants
  */
 
 /*
@@ -49,7 +49,7 @@
 
 #define JFS_DFS		0x20000000	/* DCE DFS LFS support */
 
-#define JFS_LINUX      	0x10000000	/* Linux support */
+#define JFS_LINUX	0x10000000	/* Linux support */
 /*	case-sensitive name/directory support */
 
 /* directory option */
@@ -59,7 +59,7 @@
 #define	JFS_COMMIT	0x00000f00	/* commit option mask */
 #define	JFS_GROUPCOMMIT	0x00000100	/* group (of 1) commit */
 #define	JFS_LAZYCOMMIT	0x00000200	/* lazy commit */
-#define	JFS_TMPFS	0x00000400	/* temporary file system - 
+#define	JFS_TMPFS	0x00000400	/* temporary file system -
 					 * do not log/commit:
 					 */
 
@@ -196,7 +196,7 @@
 					 * followed by 1st extent of map
 					 */
 #define AITBL_OFF	(AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1))
-					/* 
+					/*
 					 * 1st extent of aggregate inode table
 					 */
 #define SUPER2_OFF	(AITBL_OFF + INODE_EXTENT_SIZE)
@@ -270,13 +270,13 @@
  */
 #define FM_CLEAN 0x00000000	/* file system is unmounted and clean */
 #define FM_MOUNT 0x00000001	/* file system is mounted cleanly */
-#define FM_DIRTY 0x00000002	/* file system was not unmounted and clean 
-				 * when mounted or 
+#define FM_DIRTY 0x00000002	/* file system was not unmounted and clean
+				 * when mounted or
 				 * commit failure occurred while being mounted:
-				 * fsck() must be run to repair 
+				 * fsck() must be run to repair
 				 */
 #define	FM_LOGREDO 0x00000004	/* log based recovery (logredo()) failed:
-				 * fsck() must be run to repair 
+				 * fsck() must be run to repair
 				 */
 #define	FM_EXTENDFS 0x00000008	/* file system extendfs() in progress */
 
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index ccbe60aff83d..489a3d63002d 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -78,8 +78,8 @@ static HLIST_HEAD(aggregate_hash);
 /*
  * forward references
  */
-static int diAllocAG(struct inomap *, int, boolean_t, struct inode *);
-static int diAllocAny(struct inomap *, int, boolean_t, struct inode *);
+static int diAllocAG(struct inomap *, int, bool, struct inode *);
+static int diAllocAny(struct inomap *, int, bool, struct inode *);
 static int diAllocBit(struct inomap *, struct iag *, int);
 static int diAllocExt(struct inomap *, int, struct inode *);
 static int diAllocIno(struct inomap *, int, struct inode *);
@@ -98,7 +98,7 @@ static void copy_to_dinode(struct dinode *, struct inode *);
  * FUNCTION:    initialize the incore inode map control structures for
  *		a fileset or aggregate init time.
  *
- *              the inode map's control structure (dinomap) is 
+ *              the inode map's control structure (dinomap) is
  *              brought in from disk and placed in virtual memory.
  *
  * PARAMETERS:
@@ -107,7 +107,7 @@ static void copy_to_dinode(struct dinode *, struct inode *);
  * RETURN VALUES:
  *      0       - success
  *      -ENOMEM  - insufficient free virtual memory.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 int diMount(struct inode *ipimap)
 {
@@ -191,7 +191,7 @@ int diMount(struct inode *ipimap)
  * RETURN VALUES:
  *      0       - success
  *      -ENOMEM  - insufficient free virtual memory.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 int diUnmount(struct inode *ipimap, int mounterror)
 {
@@ -281,7 +281,7 @@ int diSync(struct inode *ipimap)
  *		on entry, the specifed incore inode should itself
  *		specify the disk inode number corresponding to the
  *		incore inode (i.e. i_number should be initialized).
- *		
+ *
  *		this routine handles incore inode initialization for
  *		both "special" and "regular" inodes.  special inodes
  *		are those required early in the mount process and
@@ -289,7 +289,7 @@ int diSync(struct inode *ipimap)
  *		is not yet initialized.  these "special" inodes are
  *		identified by a NULL inode map inode pointer and are
  *		actually initialized by a call to diReadSpecial().
- *		
+ *
  *		for regular inodes, the iag describing the disk inode
  *		is read from disk to determine the inode extent address
  *		for the disk inode.  with the inode extent address in
@@ -302,9 +302,9 @@ int diSync(struct inode *ipimap)
  *
  * RETURN VALUES:
  *      0       - success
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  *      -ENOMEM	- insufficient memory
- *      
+ *
  */
 int diRead(struct inode *ip)
 {
@@ -586,14 +586,14 @@ void diFreeSpecial(struct inode *ip)
  *		page of the extent that contains the disk inode is
  *		read and the disk inode portion of the incore inode
  *		is copied to the disk inode.
- *		
+ *
  * PARAMETERS:
  *	tid -  transacation id
  *      ip  -  pointer to incore inode to be written to the inode extent.
  *
  * RETURN VALUES:
  *      0       - success
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 int diWrite(tid_t tid, struct inode *ip)
 {
@@ -676,11 +676,11 @@ int diWrite(tid_t tid, struct inode *ip)
 	 * copy btree root from in-memory inode to on-disk inode
 	 *
 	 * (tlock is taken from inline B+-tree root in in-memory
-	 * inode when the B+-tree root is updated, which is pointed 
+	 * inode when the B+-tree root is updated, which is pointed
 	 * by jfs_ip->blid as well as being on tx tlock list)
 	 *
-	 * further processing of btree root is based on the copy 
-	 * in in-memory inode, where txLog() will log from, and, 
+	 * further processing of btree root is based on the copy
+	 * in in-memory inode, where txLog() will log from, and,
 	 * for xtree root, txUpdateMap() will update map and reset
 	 * XAD_NEW bit;
 	 */
@@ -824,7 +824,7 @@ int diWrite(tid_t tid, struct inode *ip)
 		memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd));
 #endif				/*  _JFS_FASTDASD */
 
-	/* release the buffer holding the updated on-disk inode. 
+	/* release the buffer holding the updated on-disk inode.
 	 * the buffer will be later written by commit processing.
 	 */
 	write_metapage(mp);
@@ -842,7 +842,7 @@ int diWrite(tid_t tid, struct inode *ip)
  *		if the inode to be freed represents the first (only)
  *		free inode within the iag, the iag will be placed on
  *		the ag free inode list.
- *	
+ *
  *		freeing the inode will cause the inode extent to be
  *		freed if the inode is the only allocated inode within
  *		the extent.  in this case all the disk resource backing
@@ -865,11 +865,11 @@ int diWrite(tid_t tid, struct inode *ip)
  *		any updates and are held until all updates are complete.
  *
  * PARAMETERS:
- *      ip  	- inode to be freed.
+ *      ip	- inode to be freed.
  *
  * RETURN VALUES:
  *      0       - success
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 int diFree(struct inode *ip)
 {
@@ -898,7 +898,7 @@ int diFree(struct inode *ip)
 	 */
 	iagno = INOTOIAG(inum);
 
-	/* make sure that the iag is contained within 
+	/* make sure that the iag is contained within
 	 * the map.
 	 */
 	if (iagno >= imap->im_nextiag) {
@@ -1013,7 +1013,7 @@ int diFree(struct inode *ip)
 
 		/* update the free inode summary map for the extent if
 		 * freeing the inode means the extent will now have free
-		 * inodes (i.e., the inode being freed is the first free 
+		 * inodes (i.e., the inode being freed is the first free
 		 * inode of extent),
 		 */
 		if (iagp->wmap[extno] == cpu_to_le32(ONES)) {
@@ -1204,9 +1204,9 @@ int diFree(struct inode *ip)
 		iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1);
 	}
 
-	/* update the inode extent address and working map 
+	/* update the inode extent address and working map
 	 * to reflect the free extent.
-	 * the permanent map should have been updated already 
+	 * the permanent map should have been updated already
 	 * for the inode being freed.
 	 */
 	if (iagp->pmap[extno] != 0) {
@@ -1218,7 +1218,7 @@ int diFree(struct inode *ip)
 
 	/* update the free extent and free inode summary maps
 	 * to reflect the freed extent.
-	 * the inode summary map is marked to indicate no inodes 
+	 * the inode summary map is marked to indicate no inodes
 	 * available for the freed extent.
 	 */
 	sword = extno >> L2EXTSPERSUM;
@@ -1255,17 +1255,17 @@ int diFree(struct inode *ip)
 	 * start transaction to update block allocation map
 	 * for the inode extent freed;
 	 *
-	 * N.B. AG_LOCK is released and iag will be released below, and 
+	 * N.B. AG_LOCK is released and iag will be released below, and
 	 * other thread may allocate inode from/reusing the ixad freed
-	 * BUT with new/different backing inode extent from the extent 
-	 * to be freed by the transaction;  
+	 * BUT with new/different backing inode extent from the extent
+	 * to be freed by the transaction;
 	 */
 	tid = txBegin(ipimap->i_sb, COMMIT_FORCE);
 	mutex_lock(&JFS_IP(ipimap)->commit_mutex);
 
-	/* acquire tlock of the iag page of the freed ixad 
+	/* acquire tlock of the iag page of the freed ixad
 	 * to force the page NOHOMEOK (even though no data is
-	 * logged from the iag page) until NOREDOPAGE|FREEXTENT log 
+	 * logged from the iag page) until NOREDOPAGE|FREEXTENT log
 	 * for the free of the extent is committed;
 	 * write FREEXTENT|NOREDOPAGE log record
 	 * N.B. linelock is overlaid as freed extent descriptor;
@@ -1284,8 +1284,8 @@ int diFree(struct inode *ip)
 	 * logredo needs the IAG number and IAG extent index in order
 	 * to ensure that the IMap is consistent.  The least disruptive
 	 * way to pass these values through  to the transaction manager
-	 * is in the iplist array.  
-	 * 
+	 * is in the iplist array.
+	 *
 	 * It's not pretty, but it works.
 	 */
 	iplist[1] = (struct inode *) (size_t)iagno;
@@ -1340,20 +1340,20 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
 /*
  * NAME:        diAlloc(pip,dir,ip)
  *
- * FUNCTION:    allocate a disk inode from the inode working map 
+ * FUNCTION:    allocate a disk inode from the inode working map
  *		for a fileset or aggregate.
  *
  * PARAMETERS:
- *      pip  	- pointer to incore inode for the parent inode.
- *      dir  	- TRUE if the new disk inode is for a directory.
- *      ip  	- pointer to a new inode
+ *      pip	- pointer to incore inode for the parent inode.
+ *      dir	- 'true' if the new disk inode is for a directory.
+ *      ip	- pointer to a new inode
  *
  * RETURN VALUES:
  *      0       - success.
  *      -ENOSPC	- insufficient disk resources.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
-int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
+int diAlloc(struct inode *pip, bool dir, struct inode *ip)
 {
 	int rc, ino, iagno, addext, extno, bitno, sword;
 	int nwords, rem, i, agno;
@@ -1372,10 +1372,10 @@ int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
 	JFS_IP(ip)->ipimap = ipimap;
 	JFS_IP(ip)->fileset = FILESYSTEM_I;
 
-	/* for a directory, the allocation policy is to start 
+	/* for a directory, the allocation policy is to start
 	 * at the ag level using the preferred ag.
 	 */
-	if (dir == TRUE) {
+	if (dir) {
 		agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap);
 		AG_LOCK(imap, agno);
 		goto tryag;
@@ -1435,7 +1435,7 @@ int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
 	/*
 	 *      try to allocate from the IAG
 	 */
-	/* check if the inode may be allocated from the iag 
+	/* check if the inode may be allocated from the iag
 	 * (i.e. the inode has free inodes or new extent can be added).
 	 */
 	if (iagp->nfreeinos || addext) {
@@ -1490,7 +1490,7 @@ int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
 		 * hint or, if appropriate (i.e. addext is true), allocate
 		 * an extent of free inodes at or following the extent
 		 * containing the hint.
-		 * 
+		 *
 		 * the free inode and free extent summary maps are used
 		 * here, so determine the starting summary map position
 		 * and the number of words we'll have to examine.  again,
@@ -1641,7 +1641,7 @@ int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
  *		inodes should be added for the allocation group, with
  *		the current request satisfied from this extent. if this
  *		is the case, an attempt will be made to do just that.  if
- *		this attempt fails or it has been determined that a new 
+ *		this attempt fails or it has been determined that a new
  *		extent should not be added, an attempt is made to satisfy
  *		the request by allocating an existing (backed) free inode
  *		from the allocation group.
@@ -1649,24 +1649,24 @@ int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip)
  * PRE CONDITION: Already have the AG lock for this AG.
  *
  * PARAMETERS:
- *      imap  	- pointer to inode map control structure.
- *      agno  	- allocation group to allocate from.
- *      dir  	- TRUE if the new disk inode is for a directory.
- *      ip  	- pointer to the new inode to be filled in on successful return
+ *      imap	- pointer to inode map control structure.
+ *      agno	- allocation group to allocate from.
+ *      dir	- 'true' if the new disk inode is for a directory.
+ *      ip	- pointer to the new inode to be filled in on successful return
  *		  with the disk inode number allocated, its extent address
  *		  and the start of the ag.
  *
  * RETURN VALUES:
  *      0       - success.
  *      -ENOSPC	- insufficient disk resources.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 static int
-diAllocAG(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
+diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
 {
 	int rc, addext, numfree, numinos;
 
-	/* get the number of free and the number of backed disk 
+	/* get the number of free and the number of backed disk
 	 * inodes currently within the ag.
 	 */
 	numfree = imap->im_agctl[agno].numfree;
@@ -1682,7 +1682,7 @@ diAllocAG(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
 	 * if there are a small number of free inodes or number of free
 	 * inodes is a small percentage of the number of backed inodes.
 	 */
-	if (dir == TRUE)
+	if (dir)
 		addext = (numfree < 64 ||
 			  (numfree < 256
 			   && ((numfree * 100) / numinos) <= 20));
@@ -1719,26 +1719,26 @@ diAllocAG(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
  *		specified primary group.
  *
  * PARAMETERS:
- *      imap  	- pointer to inode map control structure.
- *      agno  	- primary allocation group (to avoid).
- *      dir  	- TRUE if the new disk inode is for a directory.
- *      ip  	- pointer to a new inode to be filled in on successful return
+ *      imap	- pointer to inode map control structure.
+ *      agno	- primary allocation group (to avoid).
+ *      dir	- 'true' if the new disk inode is for a directory.
+ *      ip	- pointer to a new inode to be filled in on successful return
  *		  with the disk inode number allocated, its extent address
  *		  and the start of the ag.
  *
  * RETURN VALUES:
  *      0       - success.
  *      -ENOSPC	- insufficient disk resources.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 static int
-diAllocAny(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
+diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip)
 {
 	int ag, rc;
 	int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag;
 
 
-	/* try to allocate from the ags following agno up to 
+	/* try to allocate from the ags following agno up to
 	 * the maximum ag number.
 	 */
 	for (ag = agno + 1; ag <= maxag; ag++) {
@@ -1780,21 +1780,21 @@ diAllocAny(struct inomap * imap, int agno, boolean_t dir, struct inode *ip)
  *
  *		allocation occurs from the first iag on the list using
  *		the iag's free inode summary map to find the leftmost
- *		free inode in the iag. 
- *		
+ *		free inode in the iag.
+ *
  * PRE CONDITION: Already have AG lock for this AG.
- *		
+ *
  * PARAMETERS:
- *      imap  	- pointer to inode map control structure.
- *      agno  	- allocation group.
- *      ip  	- pointer to new inode to be filled in on successful return
+ *      imap	- pointer to inode map control structure.
+ *      agno	- allocation group.
+ *      ip	- pointer to new inode to be filled in on successful return
  *		  with the disk inode number allocated, its extent address
  *		  and the start of the ag.
  *
  * RETURN VALUES:
  *      0       - success.
  *      -ENOSPC	- insufficient disk resources.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
 {
@@ -1867,7 +1867,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
 		return -EIO;
 	}
 
-	/* compute the inode number within the iag. 
+	/* compute the inode number within the iag.
 	 */
 	ino = (extno << L2INOSPEREXT) + rem;
 
@@ -1892,17 +1892,17 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
 /*
  * NAME:        diAllocExt(imap,agno,ip)
  *
- * FUNCTION:   	add a new extent of free inodes to an iag, allocating
- *	       	an inode from this extent to satisfy the current allocation
- *	       	request.
- *		
+ * FUNCTION:	add a new extent of free inodes to an iag, allocating
+ *		an inode from this extent to satisfy the current allocation
+ *		request.
+ *
  *		this routine first tries to find an existing iag with free
  *		extents through the ag free extent list.  if list is not
  *		empty, the head of the list will be selected as the home
  *		of the new extent of free inodes.  otherwise (the list is
  *		empty), a new iag will be allocated for the ag to contain
  *		the extent.
- *		
+ *
  *		once an iag has been selected, the free extent summary map
  *		is used to locate a free extent within the iag and diNewExt()
  *		is called to initialize the extent, with initialization
@@ -1910,16 +1910,16 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
  *		for the purpose of satisfying this request.
  *
  * PARAMETERS:
- *      imap  	- pointer to inode map control structure.
- *      agno  	- allocation group number.
- *      ip  	- pointer to new inode to be filled in on successful return
+ *      imap	- pointer to inode map control structure.
+ *      agno	- allocation group number.
+ *      ip	- pointer to new inode to be filled in on successful return
  *		  with the disk inode number allocated, its extent address
  *		  and the start of the ag.
  *
  * RETURN VALUES:
  *      0       - success.
  *      -ENOSPC	- insufficient disk resources.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
 {
@@ -2012,7 +2012,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
 /*
  * NAME:        diAllocBit(imap,iagp,ino)
  *
- * FUNCTION:   	allocate a backed inode from an iag.
+ * FUNCTION:	allocate a backed inode from an iag.
  *
  *		this routine performs the mechanics of allocating a
  *		specified inode from a backed extent.
@@ -2025,19 +2025,19 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
  *		in the face of updates to multiple buffers.  under this
  *		approach, all required buffers are obtained before making
  *		any updates and are held all are updates are complete.
- *		
+ *
  * PRE CONDITION: Already have buffer lock on iagp.  Already have AG lock on
  *	this AG.  Must have read lock on imap inode.
  *
  * PARAMETERS:
- *      imap  	- pointer to inode map control structure.
- *      iagp  	- pointer to iag. 
- *      ino   	- inode number to be allocated within the iag.
+ *      imap	- pointer to inode map control structure.
+ *      iagp	- pointer to iag.
+ *      ino	- inode number to be allocated within the iag.
  *
  * RETURN VALUES:
  *      0       - success.
  *      -ENOSPC	- insufficient disk resources.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
 {
@@ -2172,19 +2172,19 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
  *		buffers.  under this approach, all required buffers are
  *		obtained before making any updates and are held until all
  *		updates are complete.
- *		
+ *
  * PRE CONDITION: Already have buffer lock on iagp.  Already have AG lock on
  *	this AG.  Must have read lock on imap inode.
  *
  * PARAMETERS:
- *      imap  	- pointer to inode map control structure.
- *      iagp  	- pointer to iag. 
- *      extno  	- extent number.
+ *      imap	- pointer to inode map control structure.
+ *      iagp	- pointer to iag.
+ *      extno	- extent number.
  *
  * RETURN VALUES:
  *      0       - success.
  *      -ENOSPC	- insufficient disk resources.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
 {
@@ -2432,34 +2432,34 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
 /*
  * NAME:        diNewIAG(imap,iagnop,agno)
  *
- * FUNCTION:   	allocate a new iag for an allocation group.
- *		
- *		first tries to allocate the iag from the inode map 
- *		iagfree list:  
- *		if the list has free iags, the head of the list is removed 
+ * FUNCTION:	allocate a new iag for an allocation group.
+ *
+ *		first tries to allocate the iag from the inode map
+ *		iagfree list:
+ *		if the list has free iags, the head of the list is removed
  *		and returned to satisfy the request.
  *		if the inode map's iag free list is empty, the inode map
  *		is extended to hold a new iag. this new iag is initialized
  *		and returned to satisfy the request.
  *
  * PARAMETERS:
- *      imap  	- pointer to inode map control structure.
- *      iagnop 	- pointer to an iag number set with the number of the
+ *      imap	- pointer to inode map control structure.
+ *      iagnop	- pointer to an iag number set with the number of the
  *		  newly allocated iag upon successful return.
- *      agno  	- allocation group number.
+ *      agno	- allocation group number.
  *	bpp	- Buffer pointer to be filled in with new IAG's buffer
  *
  * RETURN VALUES:
  *      0       - success.
  *      -ENOSPC	- insufficient disk resources.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  *
- * serialization: 
+ * serialization:
  *	AG lock held on entry/exit;
  *	write lock on the map is held inside;
  *	read lock on the map is held on successful completion;
  *
- * note: new iag transaction: 
+ * note: new iag transaction:
  * . synchronously write iag;
  * . write log of xtree and inode  of imap;
  * . commit;
@@ -2494,7 +2494,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 	/* acquire the free iag lock */
 	IAGFREE_LOCK(imap);
 
-	/* if there are any iags on the inode map free iag list, 
+	/* if there are any iags on the inode map free iag list,
 	 * allocate the iag from the head of the list.
 	 */
 	if (imap->im_freeiag >= 0) {
@@ -2618,8 +2618,8 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 		flush_metapage(mp);
 
 		/*
-		 * txCommit(COMMIT_FORCE) will synchronously write address 
-		 * index pages and inode after commit in careful update order 
+		 * txCommit(COMMIT_FORCE) will synchronously write address
+		 * index pages and inode after commit in careful update order
 		 * of address index pages (right to left, bottom up);
 		 */
 		iplist[0] = ipimap;
@@ -2678,11 +2678,11 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
  *
  * FUNCTION:    get the buffer for the specified iag within a fileset
  *		or aggregate inode map.
- *		
+ *
  * PARAMETERS:
- *      imap  	- pointer to inode map control structure.
- *      iagno  	- iag number.
- *      bpp  	- point to buffer pointer to be filled in on successful
+ *      imap	- pointer to inode map control structure.
+ *      iagno	- iag number.
+ *      bpp	- point to buffer pointer to be filled in on successful
  *		  exit.
  *
  * SERIALIZATION:
@@ -2692,7 +2692,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
  *
  * RETURN VALUES:
  *      0       - success.
- *      -EIO  	- i/o error.
+ *      -EIO	- i/o error.
  */
 static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
 {
@@ -2718,8 +2718,8 @@ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp)
  *		the specified bit position.
  *
  * PARAMETERS:
- *      word  	- word to be examined.
- *      start  	- starting bit position.
+ *      word	- word to be examined.
+ *      start	- starting bit position.
  *
  * RETURN VALUES:
  *      bit position of first free bit in the word or 32 if
@@ -2740,24 +2740,24 @@ static int diFindFree(u32 word, int start)
 
 /*
  * NAME:	diUpdatePMap()
- *                                                                    
- * FUNCTION: Update the persistent map in an IAG for the allocation or 
+ *
+ * FUNCTION: Update the persistent map in an IAG for the allocation or
  *	freeing of the specified inode.
- *                                                                    
+ *
  * PRE CONDITIONS: Working map has already been updated for allocate.
  *
  * PARAMETERS:
  *	ipimap	- Incore inode map inode
  *	inum	- Number of inode to mark in permanent map
- *	is_free	- If TRUE indicates inode should be marked freed, otherwise
+ *	is_free	- If 'true' indicates inode should be marked freed, otherwise
  *		  indicates inode should be marked allocated.
  *
- * RETURN VALUES: 
+ * RETURN VALUES:
  *		0 for success
  */
 int
 diUpdatePMap(struct inode *ipimap,
-	     unsigned long inum, boolean_t is_free, struct tblock * tblk)
+	     unsigned long inum, bool is_free, struct tblock * tblk)
 {
 	int rc;
 	struct iag *iagp;
@@ -2793,17 +2793,17 @@ diUpdatePMap(struct inode *ipimap,
 	extno = ino >> L2INOSPEREXT;
 	bitno = ino & (INOSPEREXT - 1);
 	mask = HIGHORDER >> bitno;
-	/* 
+	/*
 	 * mark the inode free in persistent map:
 	 */
-	if (is_free == TRUE) {
+	if (is_free) {
 		/* The inode should have been allocated both in working
 		 * map and in persistent map;
 		 * the inode will be freed from working map at the release
 		 * of last reference release;
 		 */
 		if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) {
-			jfs_error(ipimap->i_sb, 
+			jfs_error(ipimap->i_sb,
 				  "diUpdatePMap: inode %ld not marked as "
 				  "allocated in wmap!", inum);
 		}
@@ -2877,8 +2877,8 @@ diUpdatePMap(struct inode *ipimap,
  *	diExtendFS()
  *
  * function: update imap for extendfs();
- * 
- * note: AG size has been increased s.t. each k old contiguous AGs are 
+ *
+ * note: AG size has been increased s.t. each k old contiguous AGs are
  * coalesced into a new AG;
  */
 int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
@@ -2897,7 +2897,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
 		   atomic_read(&imap->im_numfree));
 
 	/*
-	 *      reconstruct imap 
+	 *      reconstruct imap
 	 *
 	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
 	 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
@@ -2931,7 +2931,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap)
 		}
 
 		/* leave free iag in the free iag list */
-		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {  
+		if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) {
 		        release_metapage(bp);
 			continue;
 		}
@@ -3115,7 +3115,6 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
 	ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
 	ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
 	ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
-	ip->i_blksize = ip->i_sb->s_blocksize;
 	ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
 	ip->i_generation = le32_to_cpu(dip->di_gen);
 
diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h
index 6e24465f0f98..4f9c346ed498 100644
--- a/fs/jfs/jfs_imap.h
+++ b/fs/jfs/jfs_imap.h
@@ -1,18 +1,18 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Copyright (C) International Business Machines Corp., 2000-2002
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_IMAP
@@ -45,13 +45,13 @@
 /* get the starting block number of the 4K page of an inode extent
  * that contains ino.
  */
-#define INOPBLK(pxd,ino,l2nbperpg)    	(addressPXD((pxd)) +		\
+#define INOPBLK(pxd,ino,l2nbperpg)	(addressPXD((pxd)) +		\
 	((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg)))
 
 /*
  *	inode allocation map:
- * 
- * inode allocation map consists of 
+ *
+ * inode allocation map consists of
  * . the inode map control page and
  * . inode allocation group pages (per 4096 inodes)
  * which are addressed by standard JFS xtree.
@@ -159,11 +159,11 @@ struct inomap {
 #define	im_maxag	im_imap.in_maxag
 
 extern int diFree(struct inode *);
-extern int diAlloc(struct inode *, boolean_t, struct inode *);
+extern int diAlloc(struct inode *, bool, struct inode *);
 extern int diSync(struct inode *);
 /* external references */
 extern int diUpdatePMap(struct inode *ipimap, unsigned long inum,
-			boolean_t is_free, struct tblock * tblk);
+			bool is_free, struct tblock * tblk);
 extern int diExtendFS(struct inode *ipimap, struct inode *ipbmap);
 extern int diMount(struct inode *);
 extern int diUnmount(struct inode *, int);
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 54d73716ca8c..94005584445a 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -4,18 +4,18 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */ 
+ */
 #ifndef _H_JFS_INCORE
 #define _H_JFS_INCORE
 
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 495df402916d..4c67ed97682b 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -61,7 +61,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	inode = new_inode(sb);
 	if (!inode) {
 		jfs_warn("ialloc: new_inode returned NULL!");
-		return inode;
+		return ERR_PTR(-ENOMEM);
 	}
 
 	jfs_inode = JFS_IP(inode);
@@ -69,9 +69,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	rc = diAlloc(parent, S_ISDIR(mode), inode);
 	if (rc) {
 		jfs_warn("ialloc: diAlloc returned %d!", rc);
-		make_bad_inode(inode);
+		if (rc == -EIO)
+			make_bad_inode(inode);
 		iput(inode);
-		return NULL;
+		return ERR_PTR(rc);
 	}
 
 	inode->i_uid = current->fsuid;
@@ -97,7 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		inode->i_flags |= S_NOQUOTA;
 		inode->i_nlink = 0;
 		iput(inode);
-		return NULL;
+		return ERR_PTR(-EDQUOT);
 	}
 
 	inode->i_mode = mode;
@@ -115,7 +116,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	}
 	jfs_inode->mode2 |= mode;
 
-	inode->i_blksize = sb->s_blocksize;
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	jfs_inode->otime = inode->i_ctime.tv_sec;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 1fc48df670c8..0d06ccfaff0e 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_INODE
diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h
index 70ac9f7d1e00..7d78e83d7c40 100644
--- a/fs/jfs/jfs_lock.h
+++ b/fs/jfs/jfs_lock.h
@@ -1,19 +1,19 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2001
- *   Portions Copyright (c) Christoph Hellwig, 2001-2002
+ *   Copyright (C) International Business Machines Corp., 2000-2001
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_LOCK
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 3315f0b1fbc0..b89c9aba0466 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -337,7 +337,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
  * PARAMETER:	cd	- commit descriptor
  *
  * RETURN:	end-of-log address
- *			
+ *
  * serialization: LOG_LOCK() held on entry/exit
  */
 static int
@@ -554,7 +554,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
  * PARAMETER:	log
  *
  * RETURN:	0
- *			
+ *
  * serialization: LOG_LOCK() held on entry/exit
  */
 static int lmNextPage(struct jfs_log * log)
@@ -656,7 +656,7 @@ static int lmNextPage(struct jfs_log * log)
  *	page number - redrive pageout of the page at the head of
  *	pageout queue until full page has been written.
  *
- * RETURN:	
+ * RETURN:
  *
  * NOTE:
  *	LOGGC_LOCK serializes log group commit queue, and
@@ -920,10 +920,10 @@ static void lmPostGC(struct lbuf * bp)
  *	this code is called again.
  *
  * PARAMETERS:	log	- log structure
- * 		hard_sync - 1 to force all metadata to be written
+ *		hard_sync - 1 to force all metadata to be written
  *
  * RETURN:	0
- *			
+ *
  * serialization: LOG_LOCK() held on entry/exit
  */
 static int lmLogSync(struct jfs_log * log, int hard_sync)
@@ -1052,7 +1052,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync)
  * FUNCTION:	write log SYNCPT record for specified log
  *
  * PARAMETERS:	log	  - log structure
- * 		hard_sync - set to 1 to force metadata to be written
+ *		hard_sync - set to 1 to force metadata to be written
  */
 void jfs_syncpt(struct jfs_log *log, int hard_sync)
 {	LOG_LOCK(log);
@@ -1067,7 +1067,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
  *	insert filesystem in the active list of the log.
  *
  * PARAMETER:	ipmnt	- file system mount inode
- *		iplog 	- log inode (out)
+ *		iplog	- log inode (out)
  *
  * RETURN:
  *
@@ -1082,7 +1082,7 @@ int lmLogOpen(struct super_block *sb)
 
 	if (sbi->flag & JFS_NOINTEGRITY)
 		return open_dummy_log(sb);
-	
+
 	if (sbi->mntflag & JFS_INLINELOG)
 		return open_inline_log(sb);
 
@@ -1131,7 +1131,7 @@ int lmLogOpen(struct super_block *sb)
 
 	log->bdev = bdev;
 	memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
-	
+
 	/*
 	 * initialize log:
 	 */
@@ -1253,13 +1253,13 @@ static int open_dummy_log(struct super_block *sb)
  *	initialize the log from log superblock.
  *	set the log state in the superblock to LOGMOUNT and
  *	write SYNCPT log record.
- *		
+ *
  * PARAMETER:	log	- log structure
  *
  * RETURN:	0	- if ok
  *		-EINVAL	- bad log magic number or superblock dirty
  *		error returned from logwait()
- *			
+ *
  * serialization: single first open thread
  */
 int lmLogInit(struct jfs_log * log)
@@ -1297,7 +1297,7 @@ int lmLogInit(struct jfs_log * log)
 
 	if (!test_bit(log_INLINELOG, &log->flag))
 		log->l2bsize = L2LOGPSIZE;
-	
+
 	/* check for disabled journaling to disk */
 	if (log->no_integrity) {
 		/*
@@ -1651,7 +1651,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
  * PARAMETER:	log	- log inode
  *
  * RETURN:	0	- success
- *			
+ *
  * serialization: single last close thread
  */
 int lmLogShutdown(struct jfs_log * log)
@@ -1677,7 +1677,7 @@ int lmLogShutdown(struct jfs_log * log)
 	lrd.type = cpu_to_le16(LOG_SYNCPT);
 	lrd.length = 0;
 	lrd.log.syncpt.sync = 0;
-	
+
 	lsn = lmWriteRecord(log, NULL, &lrd, NULL);
 	bp = log->bp;
 	lp = (struct logpage *) bp->l_ldata;
@@ -1703,7 +1703,7 @@ int lmLogShutdown(struct jfs_log * log)
 	jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d",
 		 lsn, log->page, log->eor);
 
-      out:    
+      out:
 	/*
 	 * shutdown per log i/o
 	 */
@@ -1769,7 +1769,7 @@ static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi,
 			lbmFree(bpsuper);
 			return -EIO;
 		}
-		
+
 	}
 
 	/*
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 8c6909b80014..a53fb17ea219 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_LOGMGR
@@ -35,19 +35,19 @@
 /*
  *	log logical volume
  *
- * a log is used to make the commit operation on journalled 
+ * a log is used to make the commit operation on journalled
  * files within the same logical volume group atomic.
  * a log is implemented with a logical volume.
- * there is one log per logical volume group. 
+ * there is one log per logical volume group.
  *
  * block 0 of the log logical volume is not used (ipl etc).
  * block 1 contains a log "superblock" and is used by logFormat(),
- * lmLogInit(), lmLogShutdown(), and logRedo() to record status 
- * of the log but is not otherwise used during normal processing. 
+ * lmLogInit(), lmLogShutdown(), and logRedo() to record status
+ * of the log but is not otherwise used during normal processing.
  * blocks 2 - (N-1) are used to contain log records.
  *
- * when a volume group is varied-on-line, logRedo() must have 
- * been executed before the file systems (logical volumes) in 
+ * when a volume group is varied-on-line, logRedo() must have
+ * been executed before the file systems (logical volumes) in
  * the volume group can be mounted.
  */
 /*
@@ -97,26 +97,26 @@ struct logsuper {
  *	log logical page
  *
  * (this comment should be rewritten !)
- * the header and trailer structures (h,t) will normally have 
+ * the header and trailer structures (h,t) will normally have
  * the same page and eor value.
- * An exception to this occurs when a complete page write is not 
+ * An exception to this occurs when a complete page write is not
  * accomplished on a power failure. Since the hardware may "split write"
- * sectors in the page, any out of order sequence may occur during powerfail 
+ * sectors in the page, any out of order sequence may occur during powerfail
  * and needs to be recognized during log replay.  The xor value is
  * an "exclusive or" of all log words in the page up to eor.  This
  * 32 bit eor is stored with the top 16 bits in the header and the
  * bottom 16 bits in the trailer.  logredo can easily recognize pages
- * that were not completed by reconstructing this eor and checking 
+ * that were not completed by reconstructing this eor and checking
  * the log page.
  *
- * Previous versions of the operating system did not allow split 
- * writes and detected partially written records in logredo by 
- * ordering the updates to the header, trailer, and the move of data 
- * into the logdata area.  The order: (1) data is moved (2) header 
- * is updated (3) trailer is updated.  In logredo, when the header 
- * differed from the trailer, the header and trailer were reconciled 
- * as follows: if h.page != t.page they were set to the smaller of 
- * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) 
+ * Previous versions of the operating system did not allow split
+ * writes and detected partially written records in logredo by
+ * ordering the updates to the header, trailer, and the move of data
+ * into the logdata area.  The order: (1) data is moved (2) header
+ * is updated (3) trailer is updated.  In logredo, when the header
+ * differed from the trailer, the header and trailer were reconciled
+ * as follows: if h.page != t.page they were set to the smaller of
+ * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only)
  * h.eor != t.eor they were set to the smaller of their two values.
  */
 struct logpage {
@@ -147,20 +147,20 @@ struct logpage {
  * in a  page, pages are written to temporary paging space if
  * if they must be written to disk before commit, and i/o is
  * scheduled for modified pages to their home location after
- * the log records containing the after values and the commit 
+ * the log records containing the after values and the commit
  * record is written to the log on disk, undo discards the copy
  * in main-memory.)
  *
- * a log record consists of a data area of variable length followed by 
+ * a log record consists of a data area of variable length followed by
  * a descriptor of fixed size LOGRDSIZE bytes.
- * the  data area is rounded up to an integral number of 4-bytes and 
+ * the  data area is rounded up to an integral number of 4-bytes and
  * must be no longer than LOGPSIZE.
- * the descriptor is of size of multiple of 4-bytes and aligned on a 
- * 4-byte boundary. 
+ * the descriptor is of size of multiple of 4-bytes and aligned on a
+ * 4-byte boundary.
  * records are packed one after the other in the data area of log pages.
- * (sometimes a DUMMY record is inserted so that at least one record ends 
+ * (sometimes a DUMMY record is inserted so that at least one record ends
  * on every page or the longest record is placed on at most two pages).
- * the field eor in page header/trailer points to the byte following 
+ * the field eor in page header/trailer points to the byte following
  * the last record on a page.
  */
 
@@ -270,11 +270,11 @@ struct lrd {
 		/*
 		 *      NOREDOINOEXT: the inode extent is freed
 		 *
-		 * do not apply after-image records which precede this 
-		 * record in the log with the any of the 4 page block 
-		 * numbers in this inode extent. 
-		 * 
-		 * NOTE: The fileset and pxd fields MUST remain in 
+		 * do not apply after-image records which precede this
+		 * record in the log with the any of the 4 page block
+		 * numbers in this inode extent.
+		 *
+		 * NOTE: The fileset and pxd fields MUST remain in
 		 *       the same fields in the REDOPAGE record format.
 		 *
 		 */
@@ -319,12 +319,10 @@ struct lrd {
 		 * do not apply records which precede this record in the log
 		 * with the same inode number.
 		 *
-		 * NOREDILE must be the first to be written at commit
+		 * NOREDOFILE must be the first to be written at commit
 		 * (last to be read in logredo()) - it prevents
 		 * replay of preceding updates of all preceding generations
-		 * of the inumber esp. the on-disk inode itself, 
-		 * but does NOT prevent
-		 * replay of the 
+		 * of the inumber esp. the on-disk inode itself.
 		 */
 		struct {
 			__le32 fileset;	/* 4: fileset number */
@@ -332,7 +330,7 @@ struct lrd {
 		} noredofile;
 
 		/*
-		 *      ? NEWPAGE: 
+		 *      ? NEWPAGE:
 		 *
 		 * metadata type dependent
 		 */
@@ -464,7 +462,7 @@ struct lbuf {
 	s64 l_blkno;		/* 8: log page block number */
 	caddr_t l_ldata;	/* 4: data page */
 	struct page *l_page;	/* The page itself */
-	uint l_offset;		/* Offset of l_ldata within the page */	
+	uint l_offset;		/* Offset of l_ldata within the page */
 
 	wait_queue_head_t l_ioevent;	/* 4: i/o done event */
 };
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index e1e0a6e6ebdf..0cccd1c39d75 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -257,7 +257,7 @@ static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock,
 	int rc = 0;
 	int xflag;
 	s64 xaddr;
-	sector_t file_blocks = (inode->i_size + inode->i_blksize - 1) >>
+	sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 			       inode->i_blkbits;
 
 	if (lblock >= file_blocks)
@@ -461,7 +461,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 				goto add_failed;
 		if (!bio->bi_size)
 			goto dump_bio;
-		
+
 		submit_bio(WRITE, bio);
 	}
 	if (redirty)
@@ -648,7 +648,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
 			jfs_err("logical_size = %d, size = %d",
 				mp->logical_size, size);
 			dump_stack();
-			goto unlock; 
+			goto unlock;
 		}
 		mp->count++;
 		lock_metapage(mp);
@@ -658,7 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
 					  "__get_metapage: using a "
 					  "discarded metapage");
 				discard_metapage(mp);
-				goto unlock; 
+				goto unlock;
 			}
 			clear_bit(META_discard, &mp->flag);
 		}
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index d17a3290f5aa..d94f8d9e87d7 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_METAPAGE
@@ -33,7 +33,7 @@ struct metapage {
 	unsigned long flag;	/* See Below */
 	unsigned long count;	/* Reference count */
 	void *data;		/* Data pointer */
-	sector_t index; 	/* block address of page */
+	sector_t index;		/* block address of page */
 	wait_queue_head_t wait;
 
 	/* implementation */
@@ -65,10 +65,10 @@ extern struct metapage *__get_metapage(struct inode *inode,
 				  int absolute, unsigned long new);
 
 #define read_metapage(inode, lblock, size, absolute)\
-	 __get_metapage(inode, lblock, size, absolute, FALSE)
+	 __get_metapage(inode, lblock, size, absolute, false)
 
 #define get_metapage(inode, lblock, size, absolute)\
-	 __get_metapage(inode, lblock, size, absolute, TRUE)
+	 __get_metapage(inode, lblock, size, absolute, true)
 
 extern void release_metapage(struct metapage *);
 extern void grab_metapage(struct metapage *);
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 032d111bc330..4dd479834897 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -21,18 +21,18 @@
  *
  * note: file system in transition to aggregate/fileset:
  *
- * file system mount is interpreted as the mount of aggregate, 
- * if not already mounted, and mount of the single/only fileset in 
+ * file system mount is interpreted as the mount of aggregate,
+ * if not already mounted, and mount of the single/only fileset in
  * the aggregate;
  *
  * a file system/aggregate is represented by an internal inode
  * (aka mount inode) initialized with aggregate superblock;
- * each vfs represents a fileset, and points to its "fileset inode 
+ * each vfs represents a fileset, and points to its "fileset inode
  * allocation map inode" (aka fileset inode):
- * (an aggregate itself is structured recursively as a filset: 
- * an internal vfs is constructed and points to its "fileset inode 
- * allocation map inode" (aka aggregate inode) where each inode 
- * represents a fileset inode) so that inode number is mapped to 
+ * (an aggregate itself is structured recursively as a filset:
+ * an internal vfs is constructed and points to its "fileset inode
+ * allocation map inode" (aka aggregate inode) where each inode
+ * represents a fileset inode) so that inode number is mapped to
  * on-disk inode in uniform way at both aggregate and fileset level;
  *
  * each vnode/inode of a fileset is linked to its vfs (to facilitate
@@ -41,7 +41,7 @@
  * per aggregate information, e.g., block size, etc.) as well as
  * its file set inode.
  *
- *   aggregate 
+ *   aggregate
  *   ipmnt
  *   mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap;
  *             fileset vfs     -> vp(1) <-> ... <-> vp(n) <->vproot;
@@ -88,7 +88,7 @@ int jfs_mount(struct super_block *sb)
 	struct inode *ipbmap = NULL;
 
 	/*
-	 * read/validate superblock 
+	 * read/validate superblock
 	 * (initialize mount inode from the superblock)
 	 */
 	if ((rc = chkSuper(sb))) {
@@ -238,7 +238,7 @@ int jfs_mount(struct super_block *sb)
  */
 int jfs_mount_rw(struct super_block *sb, int remount)
 {
-	struct jfs_sb_info *sbi = JFS_SBI(sb);  
+	struct jfs_sb_info *sbi = JFS_SBI(sb);
 	int rc;
 
 	/*
@@ -291,7 +291,7 @@ int jfs_mount_rw(struct super_block *sb, int remount)
 /*
  *	chkSuper()
  *
- * validate the superblock of the file system to be mounted and 
+ * validate the superblock of the file system to be mounted and
  * get the file system parameters.
  *
  * returns
@@ -426,7 +426,7 @@ int updateSuper(struct super_block *sb, uint state)
 			jfs_err("updateSuper: bad state");
 	} else if (sbi->state == FM_DIRTY)
 		return 0;
-	
+
 	if ((rc = readSuper(sb, &bh)))
 		return rc;
 
@@ -486,9 +486,9 @@ int readSuper(struct super_block *sb, struct buffer_head **bpp)
  * for this file system past this point in log.
  * it is harmless if mount fails.
  *
- * note: MOUNT record is at aggregate level, not at fileset level, 
+ * note: MOUNT record is at aggregate level, not at fileset level,
  * since log records of previous mounts of a fileset
- * (e.g., AFTER record of extent allocation) have to be processed 
+ * (e.g., AFTER record of extent allocation) have to be processed
  * to update block allocation map at aggregate level.
  */
 static int logMOUNT(struct super_block *sb)
diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h
index 682cf1a68a18..884fc21ab8ee 100644
--- a/fs/jfs/jfs_superblock.h
+++ b/fs/jfs/jfs_superblock.h
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef	_H_JFS_SUPERBLOCK
@@ -21,14 +21,14 @@
 /*
  * make the magic number something a human could read
  */
-#define JFS_MAGIC 	"JFS1"	/* Magic word */
+#define JFS_MAGIC	"JFS1"	/* Magic word */
 
 #define JFS_VERSION	2	/* Version number: Version 2 */
 
 #define LV_NAME_SIZE	11	/* MUST BE 11 for OS/2 boot sector */
 
-/* 
- *	aggregate superblock 
+/*
+ *	aggregate superblock
  *
  * The name superblock is too close to super_block, so the name has been
  * changed to jfs_superblock.  The utilities are still using the old name.
@@ -40,7 +40,7 @@ struct jfs_superblock {
 	__le64 s_size;		/* 8: aggregate size in hardware/LVM blocks;
 				 * VFS: number of blocks
 				 */
-	__le32 s_bsize;		/* 4: aggregate block size in bytes; 
+	__le32 s_bsize;		/* 4: aggregate block size in bytes;
 				 * VFS: fragment size
 				 */
 	__le16 s_l2bsize;	/* 2: log2 of s_bsize */
@@ -54,7 +54,7 @@ struct jfs_superblock {
 	__le32 s_flag;		/* 4: aggregate attributes:
 				 *    see jfs_filsys.h
 				 */
-	__le32 s_state;		/* 4: mount/unmount/recovery state: 
+	__le32 s_state;		/* 4: mount/unmount/recovery state:
 				 *    see jfs_filsys.h
 				 */
 	__le32 s_compress;		/* 4: > 0 if data compression */
@@ -75,11 +75,11 @@ struct jfs_superblock {
 	struct timestruc_t s_time;	/* 8: time last updated */
 
 	__le32 s_fsckloglen;	/* 4: Number of filesystem blocks reserved for
-				 *    the fsck service log.  
+				 *    the fsck service log.
 				 *    N.B. These blocks are divided among the
 				 *         versions kept.  This is not a per
 				 *         version size.
-				 *    N.B. These blocks are included in the 
+				 *    N.B. These blocks are included in the
 				 *         length field of s_fsckpxd.
 				 */
 	s8 s_fscklog;		/* 1: which fsck service log is most recent
@@ -87,7 +87,7 @@ struct jfs_superblock {
 				 *    1 => the first one
 				 *    2 => the 2nd one
 				 */
-	char s_fpack[11];	/* 11: file system volume name 
+	char s_fpack[11];	/* 11: file system volume name
 				 *     N.B. This must be 11 bytes to
 				 *          conform with the OS/2 BootSector
 				 *          requirements
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index efbb586bed4b..81f6f04af192 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -282,7 +282,7 @@ int txInit(void)
 	TxLockVHWM = (nTxLock * 8) / 10;
 
 	size = sizeof(struct tblock) * nTxBlock;
-	TxBlock = (struct tblock *) vmalloc(size);
+	TxBlock = vmalloc(size);
 	if (TxBlock == NULL)
 		return -ENOMEM;
 
@@ -307,7 +307,7 @@ int txInit(void)
 	 * tlock id = 0 is reserved.
 	 */
 	size = sizeof(struct tlock) * nTxLock;
-	TxLock = (struct tlock *) vmalloc(size);
+	TxLock = vmalloc(size);
 	if (TxLock == NULL) {
 		vfree(TxBlock);
 		return -ENOMEM;
@@ -2026,8 +2026,6 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
 		 * truncate entry XAD[twm == next - 1]:
 		 */
 		if (twm == next - 1) {
-			struct pxd_lock *pxdlock;
-
 			/* format a maplock for txUpdateMap() to update bmap
 			 * to free truncated delta extent of the truncated
 			 * entry XAD[next - 1];
@@ -2393,7 +2391,7 @@ static void txUpdateMap(struct tblock * tblk)
 	 * unlock mapper/write lock
 	 */
 	if (tblk->xflag & COMMIT_CREATE) {
-		diUpdatePMap(ipimap, tblk->ino, FALSE, tblk);
+		diUpdatePMap(ipimap, tblk->ino, false, tblk);
 		/* update persistent block allocation map
 		 * for the allocation of inode extent;
 		 */
@@ -2403,7 +2401,7 @@ static void txUpdateMap(struct tblock * tblk)
 		txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk);
 	} else if (tblk->xflag & COMMIT_DELETE) {
 		ip = tblk->u.ip;
-		diUpdatePMap(ipimap, ip->i_ino, TRUE, tblk);
+		diUpdatePMap(ipimap, ip->i_ino, true, tblk);
 		iput(ip);
 	}
 }
@@ -2451,7 +2449,7 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock,
 			if (xad->flag & (XAD_NEW | XAD_EXTENDED)) {
 				xaddr = addressXAD(xad);
 				xlen = lengthXAD(xad);
-				dbUpdatePMap(ipbmap, FALSE, xaddr,
+				dbUpdatePMap(ipbmap, false, xaddr,
 					     (s64) xlen, tblk);
 				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
 				jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
@@ -2462,7 +2460,7 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock,
 		pxdlock = (struct pxd_lock *) maplock;
 		xaddr = addressPXD(&pxdlock->pxd);
 		xlen = lengthPXD(&pxdlock->pxd);
-		dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen, tblk);
+		dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk);
 		jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen);
 	} else {		/* (maplock->flag & mlckALLOCPXDLIST) */
 
@@ -2471,7 +2469,7 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock,
 		for (n = 0; n < pxdlistlock->count; n++, pxd++) {
 			xaddr = addressPXD(pxd);
 			xlen = lengthPXD(pxd);
-			dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen,
+			dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen,
 				     tblk);
 			jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
 				 (ulong) xaddr, xlen);
@@ -2513,7 +2511,7 @@ void txFreeMap(struct inode *ip,
 				if (!(xad->flag & XAD_NEW)) {
 					xaddr = addressXAD(xad);
 					xlen = lengthXAD(xad);
-					dbUpdatePMap(ipbmap, TRUE, xaddr,
+					dbUpdatePMap(ipbmap, true, xaddr,
 						     (s64) xlen, tblk);
 					jfs_info("freePMap: xaddr:0x%lx "
 						 "xlen:%d",
@@ -2524,7 +2522,7 @@ void txFreeMap(struct inode *ip,
 			pxdlock = (struct pxd_lock *) maplock;
 			xaddr = addressPXD(&pxdlock->pxd);
 			xlen = lengthPXD(&pxdlock->pxd);
-			dbUpdatePMap(ipbmap, TRUE, xaddr, (s64) xlen,
+			dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen,
 				     tblk);
 			jfs_info("freePMap: xaddr:0x%lx xlen:%d",
 				 (ulong) xaddr, xlen);
@@ -2535,7 +2533,7 @@ void txFreeMap(struct inode *ip,
 			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
 				xaddr = addressPXD(pxd);
 				xlen = lengthPXD(pxd);
-				dbUpdatePMap(ipbmap, TRUE, xaddr,
+				dbUpdatePMap(ipbmap, true, xaddr,
 					     (s64) xlen, tblk);
 				jfs_info("freePMap: xaddr:0x%lx xlen:%d",
 					 (ulong) xaddr, xlen);
diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h
index 0e4dc4514c47..7863cf21afca 100644
--- a/fs/jfs/jfs_txnmgr.h
+++ b/fs/jfs/jfs_txnmgr.h
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_TXNMGR
@@ -179,7 +179,7 @@ struct linelock {
 	/* (8) */
 
 	struct lv lv[20];	/* 40: */
-}; 				/* (48) */
+};				/* (48) */
 
 #define dt_lock	linelock
 
@@ -211,8 +211,8 @@ struct xtlock {
  * at tlock.lock/linelock: watch for alignment;
  * N.B. next field may be set by linelock, and should not
  * be modified by maplock;
- * N.B. index of the first pxdlock specifies index of next 
- * free maplock (i.e., number of maplock) in the tlock; 
+ * N.B. index of the first pxdlock specifies index of next
+ * free maplock (i.e., number of maplock) in the tlock;
  */
 struct maplock {
 	lid_t next;		/* 2: */
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 5bfad39a2078..09b252958687 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -57,10 +57,6 @@ struct timestruc_t {
 #define	HIGHORDER	0x80000000u	/* high order bit on            */
 #define	ONES		0xffffffffu	/* all bit on                   */
 
-typedef int boolean_t;
-#define TRUE 1
-#define FALSE 0
-
 /*
  *	logical xd (lxd)
  */
diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c
index 21eaf7ac0fcb..a386f48c73fc 100644
--- a/fs/jfs/jfs_umount.c
+++ b/fs/jfs/jfs_umount.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -22,8 +22,8 @@
  * note: file system in transition to aggregate/fileset:
  * (ref. jfs_mount.c)
  *
- * file system unmount is interpreted as mount of the single/only 
- * fileset in the aggregate and, if unmount of the last fileset, 
+ * file system unmount is interpreted as mount of the single/only
+ * fileset in the aggregate and, if unmount of the last fileset,
  * as unmount of the aggerate;
  */
 
@@ -60,13 +60,13 @@ int jfs_umount(struct super_block *sb)
 	jfs_info("UnMount JFS: sb:0x%p", sb);
 
 	/*
-	 *      update superblock and close log 
+	 *      update superblock and close log
 	 *
 	 * if mounted read-write and log based recovery was enabled
 	 */
 	if ((log = sbi->log))
 		/*
-		 * Wait for outstanding transactions to be written to log: 
+		 * Wait for outstanding transactions to be written to log:
 		 */
 		jfs_flush_journal(log, 2);
 
@@ -112,17 +112,17 @@ int jfs_umount(struct super_block *sb)
 
 	/*
 	 * ensure all file system file pages are propagated to their
-	 * home blocks on disk (and their in-memory buffer pages are 
+	 * home blocks on disk (and their in-memory buffer pages are
 	 * invalidated) BEFORE updating file system superblock state
-	 * (to signify file system is unmounted cleanly, and thus in 
-	 * consistent state) and log superblock active file system 
+	 * (to signify file system is unmounted cleanly, and thus in
+	 * consistent state) and log superblock active file system
 	 * list (to signify skip logredo()).
 	 */
 	if (log) {		/* log = NULL if read-only mount */
 		updateSuper(sb, FM_CLEAN);
 
 		/*
-		 * close log: 
+		 * close log:
 		 *
 		 * remove file system from log active file system list.
 		 */
@@ -142,7 +142,7 @@ int jfs_umount_rw(struct super_block *sb)
 		return 0;
 
 	/*
-	 * close log: 
+	 * close log:
 	 *
 	 * remove file system from log active file system list.
 	 */
diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c
index f327decfb155..c7de6f5bbefc 100644
--- a/fs/jfs/jfs_unicode.c
+++ b/fs/jfs/jfs_unicode.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -57,8 +57,8 @@ int jfs_strfromUCS_le(char *to, const __le16 * from,
 					warn--;
 					warn_again--;
 					printk(KERN_ERR
-			"non-latin1 character 0x%x found in JFS file name\n", 
-		       			       le16_to_cpu(from[i]));
+			"non-latin1 character 0x%x found in JFS file name\n",
+					       le16_to_cpu(from[i]));
 					printk(KERN_ERR
 				"mount with iocharset=utf8 to access\n");
 				}
@@ -124,7 +124,7 @@ int get_UCSname(struct component_name * uniName, struct dentry *dentry)
 	    kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS);
 
 	if (uniName->name == NULL)
-		return -ENOSPC;
+		return -ENOMEM;
 
 	uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name,
 				       length, nls_tab);
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 69e25ebe87ac..3fbb3a225590 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -1,19 +1,19 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2002
- *   Portions Copyright (c) Christoph Hellwig, 2001-2002
+ *   Copyright (C) International Business Machines Corp., 2000-2002
+ *   Portions Copyright (C) Christoph Hellwig, 2001-2002
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_UNICODE
diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c
index 4ab185d26308..cfe50666d312 100644
--- a/fs/jfs/jfs_uniupr.c
+++ b/fs/jfs/jfs_uniupr.c
@@ -1,18 +1,18 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Copyright (C) International Business Machines Corp., 2000-2002
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 25e9990bccd1..88b6cc535bf2 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -1,5 +1,5 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Copyright (C) International Business Machines Corp., 2000-2002
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index e72f4ebb6e9c..e98eb03e5310 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 /*
@@ -2428,7 +2428,7 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p);
  * return:
  */
 int xtAppend(tid_t tid,		/* transaction id */
-	     struct inode *ip, int xflag, s64 xoff, s32 maxblocks,	
+	     struct inode *ip, int xflag, s64 xoff, s32 maxblocks,
 	     s32 * xlenp,	/* (in/out) */
 	     s64 * xaddrp,	/* (in/out) */
 	     int flag)
@@ -2499,7 +2499,7 @@ int xtAppend(tid_t tid,		/* transaction id */
 	pxdlist.maxnpxd = pxdlist.npxd = 0;
 	pxd = &pxdlist.pxd[0];
 	nblocks = JFS_SBI(ip->i_sb)->nbperpage;
-	for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) {	
+	for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) {
 		if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) {
 			PXDaddress(pxd, xaddr);
 			PXDlength(pxd, nblocks);
@@ -2514,7 +2514,7 @@ int xtAppend(tid_t tid,		/* transaction id */
 		goto out;
 	}
 
-	xlen = min(xlen, maxblocks);	
+	xlen = min(xlen, maxblocks);
 
 	/*
 	 * allocate data extent requested
@@ -2964,7 +2964,7 @@ xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad,	/* old XAD */
 			cmSetXD(ip, cp, pno, dxaddr, nblks);
 
 			/* release the cbuf, mark it as modified */
-			cmPut(cp, TRUE);
+			cmPut(cp, true);
 
 			dxaddr += nblks;
 			sxaddr += nblks;
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index af668a80b40f..164f6f2b1019 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -1,18 +1,18 @@
 /*
- *   Copyright (c) International Business Machines Corp., 2000-2002
+ *   Copyright (C) International Business Machines Corp., 2000-2002
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 #ifndef _H_JFS_XTREE
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 295268ad231b..a6a8c16c872c 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -41,7 +41,7 @@ static s64 commitZeroLink(tid_t, struct inode *);
 /*
  * NAME:	free_ea_wmap(inode)
  *
- * FUNCTION:	free uncommitted extended attributes from working map 
+ * FUNCTION:	free uncommitted extended attributes from working map
  *
  */
 static inline void free_ea_wmap(struct inode *inode)
@@ -62,7 +62,7 @@ static inline void free_ea_wmap(struct inode *inode)
  * FUNCTION:	create a regular file in the parent directory <dip>
  *		with name = <from dentry> and mode = <mode>
  *
- * PARAMETER:	dip 	- parent directory vnode
+ * PARAMETER:	dip	- parent directory vnode
  *		dentry	- dentry of new file
  *		mode	- create mode (rwxrwxrwx).
  *		nd- nd struct
@@ -97,8 +97,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 	 * begin the transaction before we search the directory.
 	 */
 	ip = ialloc(dip, mode);
-	if (ip == NULL) {
-		rc = -ENOSPC;
+	if (IS_ERR(ip)) {
+		rc = PTR_ERR(ip);
 		goto out2;
 	}
 
@@ -190,7 +190,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
  * FUNCTION:	create a child directory in the parent directory <dip>
  *		with name = <from dentry> and mode = <mode>
  *
- * PARAMETER:	dip 	- parent directory vnode
+ * PARAMETER:	dip	- parent directory vnode
  *		dentry	- dentry of child directory
  *		mode	- create mode (rwxrwxrwx).
  *
@@ -231,8 +231,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	 * begin the transaction before we search the directory.
 	 */
 	ip = ialloc(dip, S_IFDIR | mode);
-	if (ip == NULL) {
-		rc = -ENOSPC;
+	if (IS_ERR(ip)) {
+		rc = PTR_ERR(ip);
 		goto out2;
 	}
 
@@ -292,7 +292,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	mark_inode_dirty(ip);
 
 	/* update parent directory inode */
-	dip->i_nlink++;		/* for '..' from child directory */
+	inc_nlink(dip);		/* for '..' from child directory */
 	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(dip);
 
@@ -324,7 +324,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
  *
  * FUNCTION:	remove a link to child directory
  *
- * PARAMETER:	dip 	- parent inode
+ * PARAMETER:	dip	- parent inode
  *		dentry	- child directory dentry
  *
  * RETURN:	-EINVAL	- if name is . or ..
@@ -332,10 +332,10 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
  *		errors from subroutines
  *
  * note:
- * if other threads have the directory open when the last link 
- * is removed, the "." and ".." entries, if present, are removed before 
- * rmdir() returns and no new entries may be created in the directory, 
- * but the directory is not removed until the last reference to 
+ * if other threads have the directory open when the last link
+ * is removed, the "." and ".." entries, if present, are removed before
+ * rmdir() returns and no new entries may be created in the directory,
+ * but the directory is not removed until the last reference to
  * the directory is released (cf.unlink() of regular file).
  */
 static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
@@ -393,9 +393,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 	/* update parent directory's link count corresponding
 	 * to ".." entry of the target directory deleted
 	 */
-	dip->i_nlink--;
 	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
-	mark_inode_dirty(dip);
+	inode_dec_link_count(dip);
 
 	/*
 	 * OS/2 could have created EA and/or ACL
@@ -415,7 +414,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 	JFS_IP(ip)->acl.flag = 0;
 
 	/* mark the target directory as deleted */
-	ip->i_nlink = 0;
+	clear_nlink(ip);
 	mark_inode_dirty(ip);
 
 	rc = txCommit(tid, 2, &iplist[0], 0);
@@ -447,11 +446,11 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 /*
  * NAME:	jfs_unlink(dip, dentry)
  *
- * FUNCTION:	remove a link to object <vp> named by <name> 
+ * FUNCTION:	remove a link to object <vp> named by <name>
  *		from parent directory <dvp>
  *
- * PARAMETER:	dip 	- inode of parent directory
- *		dentry 	- dentry of object to be removed
+ * PARAMETER:	dip	- inode of parent directory
+ *		dentry	- dentry of object to be removed
  *
  * RETURN:	errors from subroutines
  *
@@ -515,8 +514,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
 	mark_inode_dirty(dip);
 
 	/* update target's inode */
-	ip->i_nlink--;
-	mark_inode_dirty(ip);
+	inode_dec_link_count(ip);
 
 	/*
 	 *      commit zero link count object
@@ -600,7 +598,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
  *
  * FUNCTION:    for non-directory, called by jfs_remove(),
  *		truncate a regular file, directory or symbolic
- *		link to zero length. return 0 if type is not 
+ *		link to zero length. return 0 if type is not
  *		one of these.
  *
  *		if the file is currently associated with a VM segment
@@ -610,7 +608,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
  *		map by ctrunc1.
  *		if there is no VM segment on entry, the resources are
  *		freed in both work and permanent map.
- *		(? for temporary file - memory object is cached even 
+ *		(? for temporary file - memory object is cached even
  *		after no reference:
  *		reference count > 0 -   )
  *
@@ -664,7 +662,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip)
 
 	/*
 	 * free xtree/data (truncate to zero length):
-	 * free xtree/data pages from cache if COMMIT_PWMAP, 
+	 * free xtree/data pages from cache if COMMIT_PWMAP,
 	 * free xtree/data blocks from persistent block map, and
 	 * free xtree/data blocks from working block map if COMMIT_PWMAP;
 	 */
@@ -679,7 +677,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip)
  * NAME:	jfs_free_zero_link()
  *
  * FUNCTION:    for non-directory, called by iClose(),
- *		free resources of a file from cache and WORKING map 
+ *		free resources of a file from cache and WORKING map
  *		for a file previously committed with zero link count
  *		while associated with a pager object,
  *
@@ -764,7 +762,7 @@ void jfs_free_zero_link(struct inode *ip)
  * FUNCTION:	create a link to <vp> by the name = <name>
  *		in the parent directory <dvp>
  *
- * PARAMETER:	vp 	- target object
+ * PARAMETER:	vp	- target object
  *		dvp	- parent directory of new link
  *		name	- name of new link to target object
  *		crp	- credential
@@ -824,7 +822,7 @@ static int jfs_link(struct dentry *old_dentry,
 		goto free_dname;
 
 	/* update object inode */
-	ip->i_nlink++;		/* for new link */
+	inc_nlink(ip);		/* for new link */
 	ip->i_ctime = CURRENT_TIME;
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(dir);
@@ -835,7 +833,7 @@ static int jfs_link(struct dentry *old_dentry,
 	rc = txCommit(tid, 2, &iplist[0], 0);
 
 	if (rc) {
-		ip->i_nlink--;
+		ip->i_nlink--; /* never instantiated */
 		iput(ip);
 	} else
 		d_instantiate(dentry, ip);
@@ -860,8 +858,8 @@ static int jfs_link(struct dentry *old_dentry,
  *		        in directory <dip>
  *
  * PARAMETER:	dip	    - parent directory vnode
- *		        dentry 	- dentry of symbolic link
- *		        name    - the path name of the existing object 
+ *		        dentry	- dentry of symbolic link
+ *		        name    - the path name of the existing object
  *			              that will be the source of the link
  *
  * RETURN:	errors from subroutines
@@ -908,8 +906,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	 * (iAlloc() returns new, locked inode)
 	 */
 	ip = ialloc(dip, S_IFLNK | 0777);
-	if (ip == NULL) {
-		rc = -ENOSPC;
+	if (IS_ERR(ip)) {
+		rc = PTR_ERR(ip);
 		goto out2;
 	}
 
@@ -928,7 +926,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	tblk->u.ixpxd = JFS_IP(ip)->ixpxd;
 
 	/* fix symlink access permission
-	 * (dir_create() ANDs in the u.u_cmask, 
+	 * (dir_create() ANDs in the u.u_cmask,
 	 * but symlinks really need to be 777 access)
 	 */
 	ip->i_mode |= 0777;
@@ -969,7 +967,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 		ip->i_mapping->a_ops = &jfs_aops;
 
 		/*
-		 * even though the data of symlink object (source 
+		 * even though the data of symlink object (source
 		 * path name) is treated as non-journaled user data,
 		 * it is read/written thru buffer cache for performance.
 		 */
@@ -980,7 +978,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 		xlen = xsize >> JFS_SBI(sb)->l2bsize;
 		if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) {
 			txAbort(tid, 0);
-			rc = -ENOSPC;
 			goto out3;
 		}
 		extent = xaddr;
@@ -1155,9 +1152,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			      old_ip->i_ino, JFS_RENAME);
 		if (rc)
 			goto out4;
-		new_ip->i_nlink--;
+		drop_nlink(new_ip);
 		if (S_ISDIR(new_ip->i_mode)) {
-			new_ip->i_nlink--;
+			drop_nlink(new_ip);
 			if (new_ip->i_nlink) {
 				mutex_unlock(&JFS_IP(new_ip)->commit_mutex);
 				if (old_dir != new_dir)
@@ -1178,7 +1175,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			/* free block resources */
 			if ((new_size = commitZeroLink(tid, new_ip)) < 0) {
 				txAbort(tid, 1);	/* Marks FS Dirty */
-				rc = new_size;		
+				rc = new_size;
 				goto out4;
 			}
 			tblk = tid_to_tblock(tid);
@@ -1208,7 +1205,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out4;
 		}
 		if (S_ISDIR(old_ip->i_mode))
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 	}
 	/*
 	 * Remove old directory entry
@@ -1223,7 +1220,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out4;
 	}
 	if (S_ISDIR(old_ip->i_mode)) {
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		if (old_dir != new_dir) {
 			/*
 			 * Change inode number of parent for moved directory
@@ -1294,7 +1291,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_size = xtTruncate_pmap(tid, new_ip, new_size);
 		if (new_size < 0) {
 			txAbort(tid, 1);
-			rc = new_size;		
+			rc = new_size;
 		} else
 			rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC);
 		txEnd(tid);
@@ -1352,8 +1349,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out;
 
 	ip = ialloc(dir, mode);
-	if (ip == NULL) {
-		rc = -ENOSPC;
+	if (IS_ERR(ip)) {
+		rc = PTR_ERR(ip);
 		goto out1;
 	}
 	jfs_ip = JFS_IP(ip);
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 45180361871c..79d625f3f733 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 143bcd1d5eaa..9c1c6e0e633d 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -82,7 +82,7 @@ static void jfs_handle_error(struct super_block *sb)
 			"as read-only\n",
 			sb->s_id);
 		sb->s_flags |= MS_RDONLY;
-	} 
+	}
 
 	/* nothing is done for continue beyond marking the superblock dirty */
 }
@@ -422,7 +422,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
 	if (!sbi)
-		return -ENOSPC;
+		return -ENOMEM;
 	sb->s_fs_info = sbi;
 	sbi->sb = sb;
 	sbi->uid = sbi->gid = sbi->umask = -1;
@@ -775,7 +775,7 @@ static int __init init_jfs_fs(void)
 	int rc;
 
 	jfs_inode_cachep =
-	    kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, 
+	    kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
 			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
 			    init_once, NULL);
 	if (jfs_inode_cachep == NULL)
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 16477b3835e1..cee43f36f51d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -3,16 +3,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 9bc5b7c055ce..4c7985ebca92 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -4,16 +4,16 @@
  *
  *   This program is free software;  you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation; either version 2 of the License, or 
+ *   the Free Software Foundation; either version 2 of the License, or
  *   (at your option) any later version.
- * 
+ *
  *   This program is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
  *   the GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with this program;  if not, write to the Free Software 
+ *   along with this program;  if not, write to the Free Software
  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
@@ -57,7 +57,7 @@
  *
  *   0            4                   4 + EA_SIZE(ea1)
  *   +------------+-------------------+--------------------+-----
- *   | Overall EA | First FEA Element | Second FEA Element | ..... 
+ *   | Overall EA | First FEA Element | Second FEA Element | .....
  *   | List Size  |                   |                    |
  *   +------------+-------------------+--------------------+-----
  *
@@ -97,26 +97,26 @@ static inline int is_os2_xattr(struct jfs_ea *ea)
 	 */
 	if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) &&
 	    !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		return FALSE;
+		return false;
 	/*
 	 * Check for "user."
 	 */
 	if ((ea->namelen >= XATTR_USER_PREFIX_LEN) &&
 	    !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
-		return FALSE;
+		return false;
 	/*
 	 * Check for "security."
 	 */
 	if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) &&
 	    !strncmp(ea->name, XATTR_SECURITY_PREFIX,
 		     XATTR_SECURITY_PREFIX_LEN))
-		return FALSE;
+		return false;
 	/*
 	 * Check for "trusted."
 	 */
 	if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) &&
 	    !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
-		return FALSE;
+		return false;
 	/*
 	 * Add any other valid namespace prefixes here
 	 */
@@ -124,7 +124,7 @@ static inline int is_os2_xattr(struct jfs_ea *ea)
 	/*
 	 * We assume it's OS/2's flat namespace
 	 */
-	return TRUE;
+	return true;
 }
 
 static inline int name_size(struct jfs_ea *ea)
@@ -155,9 +155,9 @@ static void ea_release(struct inode *inode, struct ea_buffer *ea_buf);
 
 /*
  * NAME: ea_write_inline
- *                                                                    
+ *
  * FUNCTION: Attempt to write an EA inline if area is available
- *                                                                    
+ *
  * PRE CONDITIONS:
  *	Already verified that the specified EA is small enough to fit inline
  *
@@ -216,10 +216,10 @@ static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist,
 
 /*
  * NAME: ea_write
- *                                                                    
+ *
  * FUNCTION: Write an EA for an inode
- *                                                                    
- * PRE CONDITIONS: EA has been verified 
+ *
+ * PRE CONDITIONS: EA has been verified
  *
  * PARAMETERS:
  *	ip	- Inode pointer
@@ -340,9 +340,9 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
 
 /*
  * NAME: ea_read_inline
- *                                                                    
+ *
  * FUNCTION: Read an inlined EA into user's buffer
- *                                                                    
+ *
  * PARAMETERS:
  *	ip	- Inode pointer
  *	ealist	- Pointer to buffer to fill in with EA
@@ -372,9 +372,9 @@ static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist)
 
 /*
  * NAME: ea_read
- *                                                                    
+ *
  * FUNCTION: copy EA data into user's buffer
- *                                                                    
+ *
  * PARAMETERS:
  *	ip	- Inode pointer
  *	ealist	- Pointer to buffer to fill in with EA
@@ -406,7 +406,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
 		return -EIO;
 	}
 
-	/* 
+	/*
 	 * Figure out how many blocks were allocated when this EA list was
 	 * originally written to disk.
 	 */
@@ -443,14 +443,14 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist)
 
 /*
  * NAME: ea_get
- *                                                                    
+ *
  * FUNCTION: Returns buffer containing existing extended attributes.
  *	     The size of the buffer will be the larger of the existing
  *	     attributes size, or min_size.
  *
  *	     The buffer, which may be inlined in the inode or in the
- * 	     page cache must be release by calling ea_release or ea_put
- *                                                                    
+ *	     page cache must be release by calling ea_release or ea_put
+ *
  * PARAMETERS:
  *	inode	- Inode pointer
  *	ea_buf	- Structure to be populated with ealist and its metadata
@@ -1054,7 +1054,7 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size)
 
 	/* compute required size of list */
 	for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) {
-	    	if (can_list(ea))
+		if (can_list(ea))
 			size += name_size(ea) + 1;
 	}
 
@@ -1069,7 +1069,7 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size)
 	/* Copy attribute names to buffer */
 	buffer = data;
 	for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) {
-	    	if (can_list(ea)) {
+		if (can_list(ea)) {
 			int namelen = copy_name(buffer, ea);
 			buffer += namelen + 1;
 		}
diff --git a/fs/libfs.c b/fs/libfs.c
index ac02ea602c3d..bd08e0e64a8c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -243,7 +243,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
 	struct inode *inode = old_dentry->d_inode;
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	inode->i_nlink++;
+	inc_nlink(inode);
 	atomic_inc(&inode->i_count);
 	dget(dentry);
 	d_instantiate(dentry, inode);
@@ -275,7 +275,7 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
-	inode->i_nlink--;
+	drop_nlink(inode);
 	dput(dentry);
 	return 0;
 }
@@ -285,9 +285,9 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!simple_empty(dentry))
 		return -ENOTEMPTY;
 
-	dentry->d_inode->i_nlink--;
+	drop_nlink(dentry->d_inode);
 	simple_unlink(dir, dentry);
-	dir->i_nlink--;
+	drop_nlink(dir);
 	return 0;
 }
 
@@ -303,10 +303,10 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_dentry->d_inode) {
 		simple_unlink(new_dir, new_dentry);
 		if (they_are_dirs)
-			old_dir->i_nlink--;
+			drop_nlink(old_dir);
 	} else if (they_are_dirs) {
-		old_dir->i_nlink--;
-		new_dir->i_nlink++;
+		drop_nlink(old_dir);
+		inc_nlink(new_dir);
 	}
 
 	old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
@@ -317,17 +317,9 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 int simple_readpage(struct file *file, struct page *page)
 {
-	void *kaddr;
-
-	if (PageUptodate(page))
-		goto out;
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr, 0, PAGE_CACHE_SIZE);
-	kunmap_atomic(kaddr, KM_USER0);
+	clear_highpage(page);
 	flush_dcache_page(page);
 	SetPageUptodate(page);
-out:
 	unlock_page(page);
 	return 0;
 }
@@ -383,7 +375,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 		return -ENOMEM;
 	inode->i_mode = S_IFDIR | 0755;
 	inode->i_uid = inode->i_gid = 0;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_op = &simple_dir_inode_operations;
@@ -405,7 +396,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 			goto out;
 		inode->i_mode = S_IFREG | files->mode;
 		inode->i_uid = inode->i_gid = 0;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_fop = files->ops;
@@ -547,7 +537,7 @@ int simple_attr_open(struct inode *inode, struct file *file,
 
 	attr->get = get;
 	attr->set = set;
-	attr->data = inode->u.generic_ip;
+	attr->data = inode->i_private;
 	attr->fmt = fmt;
 	mutex_init(&attr->mutex);
 
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 52774feab93f..b85a0ad2cfb6 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -100,12 +100,12 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 /*
  * The server lockd has called us back to tell us the lock was granted
  */
-u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
+__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
 {
 	const struct file_lock *fl = &lock->fl;
 	const struct nfs_fh *fh = &lock->fh;
 	struct nlm_wait	*block;
-	u32 res = nlm_lck_denied;
+	__be32 res = nlm_lck_denied;
 
 	/*
 	 * Look up blocked request based on arguments. 
@@ -144,42 +144,12 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
  */
 
 /*
- * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number,
- * that we mark locks for reclaiming, and that we bump the pseudo NSM state.
- */
-static void nlmclnt_prepare_reclaim(struct nlm_host *host)
-{
-	down_write(&host->h_rwsem);
-	host->h_monitored = 0;
-	host->h_state++;
-	host->h_nextrebind = 0;
-	nlm_rebind_host(host);
-
-	/*
-	 * Mark the locks for reclaiming.
-	 */
-	list_splice_init(&host->h_granted, &host->h_reclaim);
-
-	dprintk("NLM: reclaiming locks for host %s", host->h_name);
-}
-
-static void nlmclnt_finish_reclaim(struct nlm_host *host)
-{
-	host->h_reclaiming = 0;
-	up_write(&host->h_rwsem);
-	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
-}
-
-/*
  * Reclaim all locks on server host. We do this by spawning a separate
  * reclaimer thread.
  */
 void
-nlmclnt_recovery(struct nlm_host *host, u32 newstate)
+nlmclnt_recovery(struct nlm_host *host)
 {
-	if (host->h_nsmstate == newstate)
-		return;
-	host->h_nsmstate = newstate;
 	if (!host->h_reclaiming++) {
 		nlm_get_host(host);
 		__module_get(THIS_MODULE);
@@ -199,18 +169,30 @@ reclaimer(void *ptr)
 	daemonize("%s-reclaim", host->h_name);
 	allow_signal(SIGKILL);
 
+	down_write(&host->h_rwsem);
+
 	/* This one ensures that our parent doesn't terminate while the
 	 * reclaim is in progress */
 	lock_kernel();
-	lockd_up();
+	lockd_up(0); /* note: this cannot fail as lockd is already running */
+
+	dprintk("lockd: reclaiming locks for host %s", host->h_name);
 
-	nlmclnt_prepare_reclaim(host);
-	/* First, reclaim all locks that have been marked. */
 restart:
 	nsmstate = host->h_nsmstate;
+
+	/* Force a portmap getport - the peer's lockd will
+	 * most likely end up on a different port.
+	 */
+	host->h_nextrebind = jiffies;
+	nlm_rebind_host(host);
+
+	/* First, reclaim all locks that have been granted. */
+	list_splice_init(&host->h_granted, &host->h_reclaim);
 	list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
 		list_del_init(&fl->fl_u.nfs_fl.list);
 
+		/* Why are we leaking memory here? --okir */
 		if (signalled())
 			continue;
 		if (nlmclnt_reclaim(host, fl) != 0)
@@ -218,11 +200,13 @@ restart:
 		list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted);
 		if (host->h_nsmstate != nsmstate) {
 			/* Argh! The server rebooted again! */
-			list_splice_init(&host->h_granted, &host->h_reclaim);
 			goto restart;
 		}
 	}
-	nlmclnt_finish_reclaim(host);
+
+	host->h_reclaiming = 0;
+	up_write(&host->h_rwsem);
+	dprintk("NLM: done reclaiming locks for host %s", host->h_name);
 
 	/* Now, wake up all processes that sleep on a blocked lock */
 	list_for_each_entry(block, &nlm_blocked, b_list) {
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 50dbb67ae0c4..3d84f600b633 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -36,14 +36,14 @@ static const struct rpc_call_ops nlmclnt_cancel_ops;
 /*
  * Cookie counter for NLM requests
  */
-static u32	nlm_cookie = 0x1234;
+static atomic_t	nlm_cookie = ATOMIC_INIT(0x1234);
 
-static inline void nlmclnt_next_cookie(struct nlm_cookie *c)
+void nlmclnt_next_cookie(struct nlm_cookie *c)
 {
-	memcpy(c->data, &nlm_cookie, 4);
-	memset(c->data+4, 0, 4);
+	u32	cookie = atomic_inc_return(&nlm_cookie);
+
+	memcpy(c->data, &cookie, 4);
 	c->len=4;
-	nlm_cookie++;
 }
 
 static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner)
@@ -100,7 +100,7 @@ static struct nlm_lockowner *nlm_find_lockowner(struct nlm_host *host, fl_owner_
 	res = __nlm_find_lockowner(host, owner);
 	if (res == NULL) {
 		spin_unlock(&host->h_lock);
-		new = (struct nlm_lockowner *)kmalloc(sizeof(*new), GFP_KERNEL);
+		new = kmalloc(sizeof(*new), GFP_KERNEL);
 		spin_lock(&host->h_lock);
 		res = __nlm_find_lockowner(host, owner);
 		if (res == NULL && new != NULL) {
@@ -129,11 +129,11 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 	nlmclnt_next_cookie(&argp->cookie);
 	argp->state   = nsm_local_state;
 	memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh));
-	lock->caller  = system_utsname.nodename;
+	lock->caller  = utsname()->nodename;
 	lock->oh.data = req->a_owner;
 	lock->oh.len  = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
 				(unsigned int)fl->fl_u.nfs_fl.owner->pid,
-				system_utsname.nodename);
+				utsname()->nodename);
 	lock->svid = fl->fl_u.nfs_fl.owner->pid;
 	lock->fl.fl_start = fl->fl_start;
 	lock->fl.fl_end = fl->fl_end;
@@ -153,6 +153,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 {
 	struct rpc_clnt		*client = NFS_CLIENT(inode);
 	struct sockaddr_in	addr;
+	struct nfs_server	*nfssrv = NFS_SERVER(inode);
 	struct nlm_host		*host;
 	struct nlm_rqst		*call;
 	sigset_t		oldset;
@@ -166,7 +167,9 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl)
 	}
 
 	rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr));
-	host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers);
+	host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers,
+				   nfssrv->nfs_client->cl_hostname,
+				   strlen(nfssrv->nfs_client->cl_hostname));
 	if (host == NULL)
 		return -ENOLCK;
 
@@ -499,7 +502,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	unsigned char fl_flags = fl->fl_flags;
 	int status = -ENOLCK;
 
-	if (!host->h_monitored && nsm_monitor(host) < 0) {
+	if (nsm_monitor(host) < 0) {
 		printk(KERN_NOTICE "lockd: failed to monitor %s\n",
 					host->h_name);
 		goto out;
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 703fb038c813..fb24a9730345 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -27,46 +27,60 @@
 #define NLM_HOST_EXPIRE		((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ)
 #define NLM_HOST_COLLECT	((nrhosts > NLM_HOST_MAX)? 120 * HZ :  60 * HZ)
 
-static struct nlm_host *	nlm_hosts[NLM_HOST_NRHASH];
+static struct hlist_head	nlm_hosts[NLM_HOST_NRHASH];
 static unsigned long		next_gc;
 static int			nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
 
 static void			nlm_gc_hosts(void);
+static struct nsm_handle *	__nsm_find(const struct sockaddr_in *,
+					const char *, int, int);
 
 /*
  * Find an NLM server handle in the cache. If there is none, create it.
  */
 struct nlm_host *
-nlmclnt_lookup_host(struct sockaddr_in *sin, int proto, int version)
+nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version,
+			const char *hostname, int hostname_len)
 {
-	return nlm_lookup_host(0, sin, proto, version);
+	return nlm_lookup_host(0, sin, proto, version,
+			       hostname, hostname_len);
 }
 
 /*
  * Find an NLM client handle in the cache. If there is none, create it.
  */
 struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp)
+nlmsvc_lookup_host(struct svc_rqst *rqstp,
+			const char *hostname, int hostname_len)
 {
 	return nlm_lookup_host(1, &rqstp->rq_addr,
-			       rqstp->rq_prot, rqstp->rq_vers);
+			       rqstp->rq_prot, rqstp->rq_vers,
+			       hostname, hostname_len);
 }
 
 /*
  * Common host lookup routine for server & client
  */
 struct nlm_host *
-nlm_lookup_host(int server, struct sockaddr_in *sin,
-					int proto, int version)
+nlm_lookup_host(int server, const struct sockaddr_in *sin,
+					int proto, int version,
+					const char *hostname,
+					int hostname_len)
 {
-	struct nlm_host	*host, **hp;
-	u32		addr;
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nlm_host	*host;
+	struct nsm_handle *nsm = NULL;
 	int		hash;
 
-	dprintk("lockd: nlm_lookup_host(%08x, p=%d, v=%d)\n",
-			(unsigned)(sin? ntohl(sin->sin_addr.s_addr) : 0), proto, version);
+	dprintk("lockd: nlm_lookup_host(%u.%u.%u.%u, p=%d, v=%d, my role=%s, name=%.*s)\n",
+			NIPQUAD(sin->sin_addr.s_addr), proto, version,
+			server? "server" : "client",
+			hostname_len,
+			hostname? hostname : "<none>");
+
 
 	hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
 
@@ -76,7 +90,22 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	if (time_after_eq(jiffies, next_gc))
 		nlm_gc_hosts();
 
-	for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) {
+	/* We may keep several nlm_host objects for a peer, because each
+	 * nlm_host is identified by
+	 * (address, protocol, version, server/client)
+	 * We could probably simplify this a little by putting all those
+	 * different NLM rpc_clients into one single nlm_host object.
+	 * This would allow us to have one nlm_host per address.
+	 */
+	chain = &nlm_hosts[hash];
+	hlist_for_each_entry(host, pos, chain, h_hash) {
+		if (!nlm_cmp_addr(&host->h_addr, sin))
+			continue;
+
+		/* See if we have an NSM handle for this client */
+		if (!nsm)
+			nsm = host->h_nsmhandle;
+
 		if (host->h_proto != proto)
 			continue;
 		if (host->h_version != version)
@@ -84,28 +113,30 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 		if (host->h_server != server)
 			continue;
 
-		if (nlm_cmp_addr(&host->h_addr, sin)) {
-			if (hp != nlm_hosts + hash) {
-				*hp = host->h_next;
-				host->h_next = nlm_hosts[hash];
-				nlm_hosts[hash] = host;
-			}
-			nlm_get_host(host);
-			mutex_unlock(&nlm_host_mutex);
-			return host;
-		}
-	}
+		/* Move to head of hash chain. */
+		hlist_del(&host->h_hash);
+		hlist_add_head(&host->h_hash, chain);
 
-	/* Ooops, no host found, create it */
-	dprintk("lockd: creating host entry\n");
+		nlm_get_host(host);
+		goto out;
+	}
+	if (nsm)
+		atomic_inc(&nsm->sm_count);
 
-	if (!(host = (struct nlm_host *) kmalloc(sizeof(*host), GFP_KERNEL)))
-		goto nohost;
-	memset(host, 0, sizeof(*host));
+	host = NULL;
 
-	addr = sin->sin_addr.s_addr;
-	sprintf(host->h_name, "%u.%u.%u.%u", NIPQUAD(addr));
+	/* Sadly, the host isn't in our hash table yet. See if
+	 * we have an NSM handle for it. If not, create one.
+	 */
+	if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
+		goto out;
 
+	host = kzalloc(sizeof(*host), GFP_KERNEL);
+	if (!host) {
+		nsm_release(nsm);
+		goto out;
+	}
+	host->h_name	   = nsm->sm_name;
 	host->h_addr       = *sin;
 	host->h_addr.sin_port = 0;	/* ouch! */
 	host->h_version    = version;
@@ -119,9 +150,9 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	init_rwsem(&host->h_rwsem);
 	host->h_state      = 0;			/* pseudo NSM state */
 	host->h_nsmstate   = 0;			/* real NSM state */
+	host->h_nsmhandle  = nsm;
 	host->h_server	   = server;
-	host->h_next       = nlm_hosts[hash];
-	nlm_hosts[hash]    = host;
+	hlist_add_head(&host->h_hash, chain);
 	INIT_LIST_HEAD(&host->h_lockowners);
 	spin_lock_init(&host->h_lock);
 	INIT_LIST_HEAD(&host->h_granted);
@@ -130,35 +161,39 @@ nlm_lookup_host(int server, struct sockaddr_in *sin,
 	if (++nrhosts > NLM_HOST_MAX)
 		next_gc = 0;
 
-nohost:
+out:
 	mutex_unlock(&nlm_host_mutex);
 	return host;
 }
 
-struct nlm_host *
-nlm_find_client(void)
+/*
+ * Destroy a host
+ */
+static void
+nlm_destroy_host(struct nlm_host *host)
 {
-	/* find a nlm_host for a client for which h_killed == 0.
-	 * and return it
+	struct rpc_clnt	*clnt;
+
+	BUG_ON(!list_empty(&host->h_lockowners));
+	BUG_ON(atomic_read(&host->h_count));
+
+	/*
+	 * Release NSM handle and unmonitor host.
 	 */
-	int hash;
-	mutex_lock(&nlm_host_mutex);
-	for (hash = 0 ; hash < NLM_HOST_NRHASH; hash++) {
-		struct nlm_host *host, **hp;
-		for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) {
-			if (host->h_server &&
-			    host->h_killed == 0) {
-				nlm_get_host(host);
-				mutex_unlock(&nlm_host_mutex);
-				return host;
-			}
+	nsm_unmonitor(host);
+
+	if ((clnt = host->h_rpcclnt) != NULL) {
+		if (atomic_read(&clnt->cl_users)) {
+			printk(KERN_WARNING
+				"lockd: active RPC handle\n");
+			clnt->cl_dead = 1;
+		} else {
+			rpc_destroy_client(host->h_rpcclnt);
 		}
 	}
-	mutex_unlock(&nlm_host_mutex);
-	return NULL;
+	kfree(host);
 }
 
-				
 /*
  * Create the NLM RPC client for an NLM peer
  */
@@ -260,22 +295,82 @@ void nlm_release_host(struct nlm_host *host)
 }
 
 /*
+ * We were notified that the host indicated by address &sin
+ * has rebooted.
+ * Release all resources held by that peer.
+ */
+void nlm_host_rebooted(const struct sockaddr_in *sin,
+				const char *hostname, int hostname_len,
+				u32 new_state)
+{
+	struct hlist_head *chain;
+	struct hlist_node *pos;
+	struct nsm_handle *nsm;
+	struct nlm_host	*host;
+
+	dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
+			hostname, NIPQUAD(sin->sin_addr));
+
+	/* Find the NSM handle for this peer */
+	if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
+		return;
+
+	/* When reclaiming locks on this peer, make sure that
+	 * we set up a new notification */
+	nsm->sm_monitored = 0;
+
+	/* Mark all hosts tied to this NSM state as having rebooted.
+	 * We run the loop repeatedly, because we drop the host table
+	 * lock for this.
+	 * To avoid processing a host several times, we match the nsmstate.
+	 */
+again:	mutex_lock(&nlm_host_mutex);
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry(host, pos, chain, h_hash) {
+			if (host->h_nsmhandle == nsm
+			 && host->h_nsmstate != new_state) {
+				host->h_nsmstate = new_state;
+				host->h_state++;
+
+				nlm_get_host(host);
+				mutex_unlock(&nlm_host_mutex);
+
+				if (host->h_server) {
+					/* We're server for this guy, just ditch
+					 * all the locks he held. */
+					nlmsvc_free_host_resources(host);
+				} else {
+					/* He's the server, initiate lock recovery. */
+					nlmclnt_recovery(host);
+				}
+
+				nlm_release_host(host);
+				goto again;
+			}
+		}
+	}
+
+	mutex_unlock(&nlm_host_mutex);
+}
+
+/*
  * Shut down the hosts module.
  * Note that this routine is called only at server shutdown time.
  */
 void
 nlm_shutdown_hosts(void)
 {
+	struct hlist_head *chain;
+	struct hlist_node *pos;
 	struct nlm_host	*host;
-	int		i;
 
 	dprintk("lockd: shutting down host module\n");
 	mutex_lock(&nlm_host_mutex);
 
 	/* First, make all hosts eligible for gc */
 	dprintk("lockd: nuking all hosts...\n");
-	for (i = 0; i < NLM_HOST_NRHASH; i++) {
-		for (host = nlm_hosts[i]; host; host = host->h_next)
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry(host, pos, chain, h_hash)
 			host->h_expires = jiffies - 1;
 	}
 
@@ -287,8 +382,8 @@ nlm_shutdown_hosts(void)
 	if (nrhosts) {
 		printk(KERN_WARNING "lockd: couldn't shutdown host module!\n");
 		dprintk("lockd: %d hosts left:\n", nrhosts);
-		for (i = 0; i < NLM_HOST_NRHASH; i++) {
-			for (host = nlm_hosts[i]; host; host = host->h_next) {
+		for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+			hlist_for_each_entry(host, pos, chain, h_hash) {
 				dprintk("       %s (cnt %d use %d exp %ld)\n",
 					host->h_name, atomic_read(&host->h_count),
 					host->h_inuse, host->h_expires);
@@ -305,45 +400,32 @@ nlm_shutdown_hosts(void)
 static void
 nlm_gc_hosts(void)
 {
-	struct nlm_host	**q, *host;
-	struct rpc_clnt	*clnt;
-	int		i;
+	struct hlist_head *chain;
+	struct hlist_node *pos, *next;
+	struct nlm_host	*host;
 
 	dprintk("lockd: host garbage collection\n");
-	for (i = 0; i < NLM_HOST_NRHASH; i++) {
-		for (host = nlm_hosts[i]; host; host = host->h_next)
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry(host, pos, chain, h_hash)
 			host->h_inuse = 0;
 	}
 
 	/* Mark all hosts that hold locks, blocks or shares */
 	nlmsvc_mark_resources();
 
-	for (i = 0; i < NLM_HOST_NRHASH; i++) {
-		q = &nlm_hosts[i];
-		while ((host = *q) != NULL) {
+	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
+		hlist_for_each_entry_safe(host, pos, next, chain, h_hash) {
 			if (atomic_read(&host->h_count) || host->h_inuse
 			 || time_before(jiffies, host->h_expires)) {
 				dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n",
 					host->h_name, atomic_read(&host->h_count),
 					host->h_inuse, host->h_expires);
-				q = &host->h_next;
 				continue;
 			}
 			dprintk("lockd: delete host %s\n", host->h_name);
-			*q = host->h_next;
-			/* Don't unmonitor hosts that have been invalidated */
-			if (host->h_monitored && !host->h_killed)
-				nsm_unmonitor(host);
-			if ((clnt = host->h_rpcclnt) != NULL) {
-				if (atomic_read(&clnt->cl_users)) {
-					printk(KERN_WARNING
-						"lockd: active RPC handle\n");
-					clnt->cl_dead = 1;
-				} else {
-					rpc_destroy_client(host->h_rpcclnt);
-				}
-			}
-			kfree(host);
+			hlist_del_init(&host->h_hash);
+
+			nlm_destroy_host(host);
 			nrhosts--;
 		}
 	}
@@ -351,3 +433,88 @@ nlm_gc_hosts(void)
 	next_gc = jiffies + NLM_HOST_COLLECT;
 }
 
+
+/*
+ * Manage NSM handles
+ */
+static LIST_HEAD(nsm_handles);
+static DEFINE_MUTEX(nsm_mutex);
+
+static struct nsm_handle *
+__nsm_find(const struct sockaddr_in *sin,
+		const char *hostname, int hostname_len,
+		int create)
+{
+	struct nsm_handle *nsm = NULL;
+	struct list_head *pos;
+
+	if (!sin)
+		return NULL;
+
+	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
+		if (printk_ratelimit()) {
+			printk(KERN_WARNING "Invalid hostname \"%.*s\" "
+					    "in NFS lock request\n",
+				hostname_len, hostname);
+		}
+		return NULL;
+	}
+
+	mutex_lock(&nsm_mutex);
+	list_for_each(pos, &nsm_handles) {
+		nsm = list_entry(pos, struct nsm_handle, sm_link);
+
+		if (hostname && nsm_use_hostnames) {
+			if (strlen(nsm->sm_name) != hostname_len
+			 || memcmp(nsm->sm_name, hostname, hostname_len))
+				continue;
+		} else if (!nlm_cmp_addr(&nsm->sm_addr, sin))
+			continue;
+		atomic_inc(&nsm->sm_count);
+		goto out;
+	}
+
+	if (!create) {
+		nsm = NULL;
+		goto out;
+	}
+
+	nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
+	if (nsm != NULL) {
+		nsm->sm_addr = *sin;
+		nsm->sm_name = (char *) (nsm + 1);
+		memcpy(nsm->sm_name, hostname, hostname_len);
+		nsm->sm_name[hostname_len] = '\0';
+		atomic_set(&nsm->sm_count, 1);
+
+		list_add(&nsm->sm_link, &nsm_handles);
+	}
+
+out:
+	mutex_unlock(&nsm_mutex);
+	return nsm;
+}
+
+struct nsm_handle *
+nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len)
+{
+	return __nsm_find(sin, hostname, hostname_len, 1);
+}
+
+/*
+ * Release an NSM handle
+ */
+void
+nsm_release(struct nsm_handle *nsm)
+{
+	if (!nsm)
+		return;
+	if (atomic_dec_and_test(&nsm->sm_count)) {
+		mutex_lock(&nsm_mutex);
+		if (atomic_read(&nsm->sm_count) == 0) {
+			list_del(&nsm->sm_link);
+			kfree(nsm);
+		}
+		mutex_unlock(&nsm_mutex);
+	}
+}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5954dcb497e4..eb243edf8932 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -24,13 +24,13 @@ static struct rpc_program	nsm_program;
 /*
  * Local NSM state
  */
-u32				nsm_local_state;
+int				nsm_local_state;
 
 /*
  * Common procedure for SM_MON/SM_UNMON calls
  */
 static int
-nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
+nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 {
 	struct rpc_clnt	*clnt;
 	int		status;
@@ -46,10 +46,11 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
 		goto out;
 	}
 
-	args.addr = host->h_addr.sin_addr.s_addr;
-	args.proto= (host->h_proto<<1) | host->h_server;
+	memset(&args, 0, sizeof(args));
+	args.mon_name = nsm->sm_name;
+	args.addr = nsm->sm_addr.sin_addr.s_addr;
 	args.prog = NLM_PROGRAM;
-	args.vers = host->h_version;
+	args.vers = 3;
 	args.proc = NLMPROC_NSM_NOTIFY;
 	memset(res, 0, sizeof(*res));
 
@@ -70,17 +71,22 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res)
 int
 nsm_monitor(struct nlm_host *host)
 {
+	struct nsm_handle *nsm = host->h_nsmhandle;
 	struct nsm_res	res;
 	int		status;
 
 	dprintk("lockd: nsm_monitor(%s)\n", host->h_name);
+	BUG_ON(nsm == NULL);
 
-	status = nsm_mon_unmon(host, SM_MON, &res);
+	if (nsm->sm_monitored)
+		return 0;
+
+	status = nsm_mon_unmon(nsm, SM_MON, &res);
 
 	if (status < 0 || res.status != 0)
 		printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name);
 	else
-		host->h_monitored = 1;
+		nsm->sm_monitored = 1;
 	return status;
 }
 
@@ -90,16 +96,26 @@ nsm_monitor(struct nlm_host *host)
 int
 nsm_unmonitor(struct nlm_host *host)
 {
+	struct nsm_handle *nsm = host->h_nsmhandle;
 	struct nsm_res	res;
-	int		status;
-
-	dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
-
-	status = nsm_mon_unmon(host, SM_UNMON, &res);
-	if (status < 0)
-		printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", host->h_name);
-	else
-		host->h_monitored = 0;
+	int		status = 0;
+
+	if (nsm == NULL)
+		return 0;
+	host->h_nsmhandle = NULL;
+
+	if (atomic_read(&nsm->sm_count) == 1
+	 && nsm->sm_monitored && !nsm->sm_sticky) {
+		dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
+
+		status = nsm_mon_unmon(nsm, SM_UNMON, &res);
+		if (status < 0)
+			printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
+					host->h_name);
+		else
+			nsm->sm_monitored = 0;
+	}
+	nsm_release(nsm);
 	return status;
 }
 
@@ -132,10 +148,10 @@ nsm_create(void)
  * XDR functions for NSM.
  */
 
-static u32 *
-xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
+static __be32 *
+xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
-	char	buffer[20];
+	char	buffer[20], *name;
 
 	/*
 	 * Use the dotted-quad IP address of the remote host as
@@ -143,9 +159,14 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
 	 * hostname first for whatever remote hostname it receives,
 	 * so this works alright.
 	 */
-	sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr));
-	if (!(p = xdr_encode_string(p, buffer))
-	 || !(p = xdr_encode_string(p, system_utsname.nodename)))
+	if (nsm_use_hostnames) {
+		name = argp->mon_name;
+	} else {
+		sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr));
+		name = buffer;
+	}
+	if (!(p = xdr_encode_string(p, name))
+	 || !(p = xdr_encode_string(p, utsname()->nodename)))
 		return ERR_PTR(-EIO);
 	*p++ = htonl(argp->prog);
 	*p++ = htonl(argp->vers);
@@ -155,21 +176,23 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
 }
 
 static int
-xdr_encode_mon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
+xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
 	p = xdr_encode_common(rqstp, p, argp);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
+
+	/* Surprise - there may even be room for an IPv6 address now */
 	*p++ = argp->addr;
-	*p++ = argp->vers;
-	*p++ = argp->proto;
+	*p++ = 0;
+	*p++ = 0;
 	*p++ = 0;
 	rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
 	return 0;
 }
 
 static int
-xdr_encode_unmon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
+xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
 {
 	p = xdr_encode_common(rqstp, p, argp);
 	if (IS_ERR(p))
@@ -179,7 +202,7 @@ xdr_encode_unmon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp)
 }
 
 static int
-xdr_decode_stat_res(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp)
+xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 {
 	resp->status = ntohl(*p++);
 	resp->state = ntohl(*p++);
@@ -189,7 +212,7 @@ xdr_decode_stat_res(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp)
 }
 
 static int
-xdr_decode_stat(struct rpc_rqst *rqstp, u32 *p, struct nsm_res *resp)
+xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 {
 	resp->state = ntohl(*p++);
 	return 0;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 9a991b52c647..634139232aaf 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -31,7 +31,9 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
+#include <net/ip.h>
 #include <linux/lockd/lockd.h>
+#include <linux/lockd/sm_inter.h>
 #include <linux/nfs.h>
 
 #define NLMDBG_FACILITY		NLMDBG_SVC
@@ -46,6 +48,7 @@ EXPORT_SYMBOL(nlmsvc_ops);
 static DEFINE_MUTEX(nlmsvc_mutex);
 static unsigned int		nlmsvc_users;
 static pid_t			nlmsvc_pid;
+static struct svc_serv		*nlmsvc_serv;
 int				nlmsvc_grace_period;
 unsigned long			nlmsvc_timeout;
 
@@ -59,6 +62,7 @@ static DECLARE_WAIT_QUEUE_HEAD(lockd_exit);
 static unsigned long		nlm_grace_period;
 static unsigned long		nlm_timeout = LOCKD_DFLT_TIMEO;
 static int			nlm_udpport, nlm_tcpport;
+int				nsm_use_hostnames = 0;
 
 /*
  * Constants needed for the sysctl interface.
@@ -96,7 +100,6 @@ static inline void clear_grace_period(void)
 static void
 lockd(struct svc_rqst *rqstp)
 {
-	struct svc_serv	*serv = rqstp->rq_server;
 	int		err = 0;
 	unsigned long grace_period_expire;
 
@@ -112,6 +115,7 @@ lockd(struct svc_rqst *rqstp)
 	 * Let our maker know we're running.
 	 */
 	nlmsvc_pid = current->pid;
+	nlmsvc_serv = rqstp->rq_server;
 	complete(&lockd_start_done);
 
 	daemonize("lockd");
@@ -161,7 +165,7 @@ lockd(struct svc_rqst *rqstp)
 		 * Find a socket with data available and call its
 		 * recvfrom routine.
 		 */
-		err = svc_recv(serv, rqstp, timeout);
+		err = svc_recv(rqstp, timeout);
 		if (err == -EAGAIN || err == -EINTR)
 			continue;
 		if (err < 0) {
@@ -174,7 +178,7 @@ lockd(struct svc_rqst *rqstp)
 		dprintk("lockd: request from %08x\n",
 			(unsigned)ntohl(rqstp->rq_addr.sin_addr.s_addr));
 
-		svc_process(serv, rqstp);
+		svc_process(rqstp);
 
 	}
 
@@ -189,6 +193,7 @@ lockd(struct svc_rqst *rqstp)
 			nlmsvc_invalidate_all();
 		nlm_shutdown_hosts();
 		nlmsvc_pid = 0;
+		nlmsvc_serv = NULL;
 	} else
 		printk(KERN_DEBUG
 			"lockd: new process, skipping host shutdown\n");
@@ -205,54 +210,77 @@ lockd(struct svc_rqst *rqstp)
 	module_put_and_exit(0);
 }
 
+
+static int find_socket(struct svc_serv *serv, int proto)
+{
+	struct svc_sock *svsk;
+	int found = 0;
+	list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
+		if (svsk->sk_sk->sk_protocol == proto) {
+			found = 1;
+			break;
+		}
+	return found;
+}
+
+static int make_socks(struct svc_serv *serv, int proto)
+{
+	/* Make any sockets that are needed but not present.
+	 * If nlm_udpport or nlm_tcpport were set as module
+	 * options, make those sockets unconditionally
+	 */
+	static int		warned;
+	int err = 0;
+	if (proto == IPPROTO_UDP || nlm_udpport)
+		if (!find_socket(serv, IPPROTO_UDP))
+			err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport);
+	if (err == 0 && (proto == IPPROTO_TCP || nlm_tcpport))
+		if (!find_socket(serv, IPPROTO_TCP))
+			err= svc_makesock(serv, IPPROTO_TCP, nlm_tcpport);
+	if (!err)
+		warned = 0;
+	else if (warned++ == 0)
+		printk(KERN_WARNING
+		       "lockd_up: makesock failed, error=%d\n", err);
+	return err;
+}
+
 /*
  * Bring up the lockd process if it's not already up.
  */
 int
-lockd_up(void)
+lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
 {
-	static int		warned;
 	struct svc_serv *	serv;
 	int			error = 0;
 
 	mutex_lock(&nlmsvc_mutex);
 	/*
-	 * Unconditionally increment the user count ... this is
-	 * the number of clients who _want_ a lockd process.
-	 */
-	nlmsvc_users++; 
-	/*
 	 * Check whether we're already up and running.
 	 */
-	if (nlmsvc_pid)
+	if (nlmsvc_pid) {
+		if (proto)
+			error = make_socks(nlmsvc_serv, proto);
 		goto out;
+	}
 
 	/*
 	 * Sanity check: if there's no pid,
 	 * we should be the first user ...
 	 */
-	if (nlmsvc_users > 1)
+	if (nlmsvc_users)
 		printk(KERN_WARNING
 			"lockd_up: no pid, %d users??\n", nlmsvc_users);
 
 	error = -ENOMEM;
-	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE);
+	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
 	if (!serv) {
 		printk(KERN_WARNING "lockd_up: create service failed\n");
 		goto out;
 	}
 
-	if ((error = svc_makesock(serv, IPPROTO_UDP, nlm_udpport)) < 0 
-#ifdef CONFIG_NFSD_TCP
-	 || (error = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport)) < 0
-#endif
-		) {
-		if (warned++ == 0) 
-			printk(KERN_WARNING
-				"lockd_up: makesock failed, error=%d\n", error);
+	if ((error = make_socks(serv, proto)) < 0)
 		goto destroy_and_out;
-	} 
-	warned = 0;
 
 	/*
 	 * Create the kernel thread and wait for it to start.
@@ -272,6 +300,8 @@ lockd_up(void)
 destroy_and_out:
 	svc_destroy(serv);
 out:
+	if (!error)
+		nlmsvc_users++;
 	mutex_unlock(&nlmsvc_mutex);
 	return error;
 }
@@ -367,6 +397,22 @@ static ctl_table nlm_sysctls[] = {
 		.extra1		= (int *) &nlm_port_min,
 		.extra2		= (int *) &nlm_port_max,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nsm_use_hostnames",
+		.data		= &nsm_use_hostnames,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nsm_local_state",
+		.data		= &nsm_local_state,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
 
@@ -455,6 +501,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
 		  &nlm_udpport, 0644);
 module_param_call(nlm_tcpport, param_set_port, param_get_int,
 		  &nlm_tcpport, 0644);
+module_param(nsm_use_hostnames, bool, 0644);
 
 /*
  * Initialising and terminating the module.
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index a2dd9ccb9b32..0ce5c81ff507 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -24,22 +24,22 @@
 /*
  * Obtain client and file from arguments
  */
-static u32
+static __be32
 nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 			struct nlm_host **hostp, struct nlm_file **filp)
 {
 	struct nlm_host		*host = NULL;
 	struct nlm_file		*file = NULL;
 	struct nlm_lock		*lock = &argp->lock;
-	u32			error = 0;
+	__be32			error = 0;
 
 	/* nfsd callbacks must have been installed for this procedure */
 	if (!nlmsvc_ops)
 		return nlm_lck_denied_nolocks;
 
 	/* Obtain host handle */
-	if (!(host = nlmsvc_lookup_host(rqstp))
-	 || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0))
+	if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
+	 || (argp->monitor && nsm_monitor(host) < 0))
 		goto no_locks;
 	*hostp = host;
 
@@ -68,7 +68,7 @@ no_locks:
 /*
  * NULL: Test for presence of service
  */
-static int
+static __be32
 nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	dprintk("lockd: NULL          called\n");
@@ -78,7 +78,7 @@ nlm4svc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * TEST: Check for conflicting lock
  */
-static int
+static __be32
 nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 				         struct nlm_res  *resp)
 {
@@ -96,7 +96,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now check for conflicting locks */
 	resp->status = nlmsvc_testlock(file, &argp->lock, &resp->lock);
@@ -107,7 +107,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return rpc_success;
 }
 
-static int
+static __be32
 nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 				         struct nlm_res  *resp)
 {
@@ -126,7 +126,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 #if 0
 	/* If supplied state doesn't match current state, we assume it's
@@ -150,7 +150,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return rpc_success;
 }
 
-static int
+static __be32
 nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 				           struct nlm_res  *resp)
 {
@@ -169,7 +169,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Try to cancel request. */
 	resp->status = nlmsvc_cancel_blocked(file, &argp->lock);
@@ -183,7 +183,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * UNLOCK: release a lock
  */
-static int
+static __be32
 nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
 				           struct nlm_res  *resp)
 {
@@ -202,7 +202,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now try to remove the lock */
 	resp->status = nlmsvc_unlock(file, &argp->lock);
@@ -217,7 +217,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
  * GRANTED: A server calls us to tell that a process' lock request
  * was granted
  */
-static int
+static __be32
 nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 				            struct nlm_res  *resp)
 {
@@ -253,14 +253,16 @@ static const struct rpc_call_ops nlm4svc_callback_ops = {
  * because we send the callback before the reply proper. I hope this
  * doesn't break any clients.
  */
-static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
-		int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res  *))
+static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
+		__be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res  *))
 {
 	struct nlm_host	*host;
 	struct nlm_rqst	*call;
-	int stat;
+	__be32 stat;
 
-	host = nlmsvc_lookup_host(rqstp);
+	host = nlmsvc_lookup_host(rqstp,
+				  argp->lock.caller,
+				  argp->lock.len);
 	if (host == NULL)
 		return rpc_system_err;
 
@@ -280,35 +282,35 @@ static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *a
 	return rpc_success;
 }
 
-static int nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					     void	     *resp)
 {
 	dprintk("lockd: TEST_MSG      called\n");
 	return nlm4svc_callback(rqstp, NLMPROC_TEST_RES, argp, nlm4svc_proc_test);
 }
 
-static int nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					     void	     *resp)
 {
 	dprintk("lockd: LOCK_MSG      called\n");
 	return nlm4svc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlm4svc_proc_lock);
 }
 
-static int nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					       void	       *resp)
 {
 	dprintk("lockd: CANCEL_MSG    called\n");
 	return nlm4svc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlm4svc_proc_cancel);
 }
 
-static int nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
                                                void            *resp)
 {
 	dprintk("lockd: UNLOCK_MSG    called\n");
 	return nlm4svc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlm4svc_proc_unlock);
 }
 
-static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
                                                 void            *resp)
 {
 	dprintk("lockd: GRANTED_MSG   called\n");
@@ -318,7 +320,7 @@ static int nlm4svc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *arg
 /*
  * SHARE: create a DOS share or alter existing share.
  */
-static int
+static __be32
 nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 				          struct nlm_res  *resp)
 {
@@ -337,7 +339,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now try to create the share */
 	resp->status = nlmsvc_share_file(host, file, argp);
@@ -351,7 +353,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * UNSHARE: Release a DOS share.
  */
-static int
+static __be32
 nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 				            struct nlm_res  *resp)
 {
@@ -370,7 +372,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now try to lock the file */
 	resp->status = nlmsvc_unshare_file(host, file, argp);
@@ -384,7 +386,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * NM_LOCK: Create an unmonitored lock
  */
-static int
+static __be32
 nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 				            struct nlm_res  *resp)
 {
@@ -397,7 +399,7 @@ nlm4svc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * FREE_ALL: Release all locks and shares held by client
  */
-static int
+static __be32
 nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
 					     void            *resp)
 {
@@ -415,15 +417,11 @@ nlm4svc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * SM_NOTIFY: private callback from statd (not part of official NLM proto)
  */
-static int
+static __be32
 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 					      void	        *resp)
 {
 	struct sockaddr_in	saddr = rqstp->rq_addr;
-	int			vers = argp->vers;
-	int			prot = argp->proto >> 1;
-
-	struct nlm_host		*host;
 
 	dprintk("lockd: SM_NOTIFY     called\n");
 	if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
@@ -438,28 +436,17 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 	/* Obtain the host pointer for this NFS server and try to
 	 * reclaim all locks we hold on this server.
 	 */
+	memset(&saddr, 0, sizeof(saddr));
 	saddr.sin_addr.s_addr = argp->addr;
+	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
 
-	if ((argp->proto & 1)==0) {
-		if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) {
-			nlmclnt_recovery(host, argp->state);
-			nlm_release_host(host);
-		}
-	} else {
-		/* If we run on an NFS server, delete all locks held by the client */
-
-		if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) {
-			nlmsvc_free_host_resources(host);
-			nlm_release_host(host);
-		}
-	}
 	return rpc_success;
 }
 
 /*
  * client sent a GRANTED_RES, let's remove the associated block
  */
-static int
+static __be32
 nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res  *argp,
                                                 void            *resp)
 {
@@ -468,7 +455,7 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res  *argp,
 
         dprintk("lockd: GRANTED_RES   called\n");
 
-        nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status);
+        nlmsvc_grant_reply(&argp->cookie, argp->status);
         return rpc_success;
 }
 
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index c9d419703cf3..7e219b938552 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -40,7 +40,7 @@
 
 static void nlmsvc_release_block(struct nlm_block *block);
 static void	nlmsvc_insert_block(struct nlm_block *block, unsigned long);
-static int	nlmsvc_remove_block(struct nlm_block *block);
+static void	nlmsvc_remove_block(struct nlm_block *block);
 
 static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock);
 static void nlmsvc_freegrantargs(struct nlm_rqst *call);
@@ -49,7 +49,7 @@ static const struct rpc_call_ops nlmsvc_grant_ops;
 /*
  * The list of blocked locks to retry
  */
-static struct nlm_block *	nlm_blocked;
+static LIST_HEAD(nlm_blocked);
 
 /*
  * Insert a blocked lock into the global list
@@ -57,48 +57,44 @@ static struct nlm_block *	nlm_blocked;
 static void
 nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
 {
-	struct nlm_block **bp, *b;
+	struct nlm_block *b;
+	struct list_head *pos;
 
 	dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when);
-	kref_get(&block->b_count);
-	if (block->b_queued)
-		nlmsvc_remove_block(block);
-	bp = &nlm_blocked;
+	if (list_empty(&block->b_list)) {
+		kref_get(&block->b_count);
+	} else {
+		list_del_init(&block->b_list);
+	}
+
+	pos = &nlm_blocked;
 	if (when != NLM_NEVER) {
 		if ((when += jiffies) == NLM_NEVER)
 			when ++;
-		while ((b = *bp) && time_before_eq(b->b_when,when) && b->b_when != NLM_NEVER)
-			bp = &b->b_next;
-	} else
-		while ((b = *bp) != 0)
-			bp = &b->b_next;
+		list_for_each(pos, &nlm_blocked) {
+			b = list_entry(pos, struct nlm_block, b_list);
+			if (time_after(b->b_when,when) || b->b_when == NLM_NEVER)
+				break;
+		}
+		/* On normal exit from the loop, pos == &nlm_blocked,
+		 * so we will be adding to the end of the list - good
+		 */
+	}
 
-	block->b_queued = 1;
+	list_add_tail(&block->b_list, pos);
 	block->b_when = when;
-	block->b_next = b;
-	*bp = block;
 }
 
 /*
  * Remove a block from the global list
  */
-static int
+static inline void
 nlmsvc_remove_block(struct nlm_block *block)
 {
-	struct nlm_block **bp, *b;
-
-	if (!block->b_queued)
-		return 1;
-	for (bp = &nlm_blocked; (b = *bp) != 0; bp = &b->b_next) {
-		if (b == block) {
-			*bp = block->b_next;
-			block->b_queued = 0;
-			nlmsvc_release_block(block);
-			return 1;
-		}
+	if (!list_empty(&block->b_list)) {
+		list_del_init(&block->b_list);
+		nlmsvc_release_block(block);
 	}
-
-	return 0;
 }
 
 /*
@@ -107,14 +103,14 @@ nlmsvc_remove_block(struct nlm_block *block)
 static struct nlm_block *
 nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
 {
-	struct nlm_block	**head, *block;
+	struct nlm_block	*block;
 	struct file_lock	*fl;
 
 	dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n",
 				file, lock->fl.fl_pid,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end, lock->fl.fl_type);
-	for (head = &nlm_blocked; (block = *head) != 0; head = &block->b_next) {
+	list_for_each_entry(block, &nlm_blocked, b_list) {
 		fl = &block->b_call->a_args.lock.fl;
 		dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
 				block->b_file, fl->fl_pid,
@@ -143,20 +139,20 @@ static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b)
  * Find a block with a given NLM cookie.
  */
 static inline struct nlm_block *
-nlmsvc_find_block(struct nlm_cookie *cookie,  struct sockaddr_in *sin)
+nlmsvc_find_block(struct nlm_cookie *cookie)
 {
 	struct nlm_block *block;
 
-	for (block = nlm_blocked; block; block = block->b_next) {
-		dprintk("cookie: head of blocked queue %p, block %p\n", 
-			nlm_blocked, block);
-		if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)
-				&& nlm_cmp_addr(sin, &block->b_host->h_addr))
-			break;
+	list_for_each_entry(block, &nlm_blocked, b_list) {
+		if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie))
+			goto found;
 	}
 
-	if (block != NULL)
-		kref_get(&block->b_count);
+	return NULL;
+
+found:
+	dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block);
+	kref_get(&block->b_count);
 	return block;
 }
 
@@ -169,6 +165,11 @@ nlmsvc_find_block(struct nlm_cookie *cookie,  struct sockaddr_in *sin)
  * request, but (as I found out later) that's because some implementations
  * do just this. Never mind the standards comittees, they support our
  * logging industries.
+ *
+ * 10 years later: I hope we can safely ignore these old and broken
+ * clients by now. Let's fix this so we can uniquely identify an incoming
+ * GRANTED_RES message by cookie, without having to rely on the client's IP
+ * address. --okir
  */
 static inline struct nlm_block *
 nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
@@ -179,7 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 	struct nlm_rqst		*call = NULL;
 
 	/* Create host handle for callback */
-	host = nlmsvc_lookup_host(rqstp);
+	host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
 	if (host == NULL)
 		return NULL;
 
@@ -192,6 +193,8 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 	if (block == NULL)
 		goto failed;
 	kref_init(&block->b_count);
+	INIT_LIST_HEAD(&block->b_list);
+	INIT_LIST_HEAD(&block->b_flist);
 
 	if (!nlmsvc_setgrantargs(call, lock))
 		goto failed_free;
@@ -199,7 +202,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 	/* Set notifier function for VFS, and init args */
 	call->a_args.lock.fl.fl_flags |= FL_SLEEP;
 	call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
-	call->a_args.cookie = *cookie;	/* see above */
+	nlmclnt_next_cookie(&call->a_args.cookie);
 
 	dprintk("lockd: created block %p...\n", block);
 
@@ -210,8 +213,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file,
 	file->f_count++;
 
 	/* Add to file's list of blocks */
-	block->b_fnext  = file->f_blocks;
-	file->f_blocks  = block;
+	list_add(&block->b_flist, &file->f_blocks);
 
 	/* Set up RPC arguments for callback */
 	block->b_call = call;
@@ -248,19 +250,13 @@ static void nlmsvc_free_block(struct kref *kref)
 {
 	struct nlm_block *block = container_of(kref, struct nlm_block, b_count);
 	struct nlm_file		*file = block->b_file;
-	struct nlm_block	**bp;
 
 	dprintk("lockd: freeing block %p...\n", block);
 
-	down(&file->f_sema);
 	/* Remove block from file's list of blocks */
-	for (bp = &file->f_blocks; *bp; bp = &(*bp)->b_fnext) {
-		if (*bp == block) {
-			*bp = block->b_fnext;
-			break;
-		}
-	}
-	up(&file->f_sema);
+	mutex_lock(&file->f_mutex);
+	list_del_init(&block->b_flist);
+	mutex_unlock(&file->f_mutex);
 
 	nlmsvc_freegrantargs(block->b_call);
 	nlm_release_call(block->b_call);
@@ -274,47 +270,32 @@ static void nlmsvc_release_block(struct nlm_block *block)
 		kref_put(&block->b_count, nlmsvc_free_block);
 }
 
-static void nlmsvc_act_mark(struct nlm_host *host, struct nlm_file *file)
-{
-	struct nlm_block *block;
-
-	down(&file->f_sema);
-	for (block = file->f_blocks; block != NULL; block = block->b_fnext)
-		block->b_host->h_inuse = 1;
-	up(&file->f_sema);
-}
-
-static void nlmsvc_act_unlock(struct nlm_host *host, struct nlm_file *file)
+/*
+ * Loop over all blocks and delete blocks held by
+ * a matching host.
+ */
+void nlmsvc_traverse_blocks(struct nlm_host *host,
+			struct nlm_file *file,
+			nlm_host_match_fn_t match)
 {
-	struct nlm_block *block;
+	struct nlm_block *block, *next;
 
 restart:
-	down(&file->f_sema);
-	for (block = file->f_blocks; block != NULL; block = block->b_fnext) {
-		if (host != NULL && host != block->b_host)
+	mutex_lock(&file->f_mutex);
+	list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) {
+		if (!match(block->b_host, host))
 			continue;
-		if (!block->b_queued)
+		/* Do not destroy blocks that are not on
+		 * the global retry list - why? */
+		if (list_empty(&block->b_list))
 			continue;
 		kref_get(&block->b_count);
-		up(&file->f_sema);
+		mutex_unlock(&file->f_mutex);
 		nlmsvc_unlink_block(block);
 		nlmsvc_release_block(block);
 		goto restart;
 	}
-	up(&file->f_sema);
-}
-
-/*
- * Loop over all blocks and perform the action specified.
- * (NLM_ACT_CHECK handled by nlmsvc_inspect_file).
- */
-void
-nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action)
-{
-	if (action == NLM_ACT_MARK)
-		nlmsvc_act_mark(host, file);
-	else
-		nlmsvc_act_unlock(host, file);
+	mutex_unlock(&file->f_mutex);
 }
 
 /*
@@ -325,7 +306,7 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
 {
 	locks_copy_lock(&call->a_args.lock.fl, &lock->fl);
 	memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh));
-	call->a_args.lock.caller = system_utsname.nodename;
+	call->a_args.lock.caller = utsname()->nodename;
 	call->a_args.lock.oh.len = lock->oh.len;
 
 	/* set default data area */
@@ -353,13 +334,13 @@ static void nlmsvc_freegrantargs(struct nlm_rqst *call)
  * Attempt to establish a lock, and if it can't be granted, block it
  * if required.
  */
-u32
+__be32
 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
 {
 	struct nlm_block	*block, *newblock = NULL;
 	int			error;
-	u32			ret;
+	__be32			ret;
 
 	dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
 				file->f_file->f_dentry->d_inode->i_sb->s_id,
@@ -373,7 +354,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	lock->fl.fl_flags &= ~FL_SLEEP;
 again:
 	/* Lock file against concurrent access */
-	down(&file->f_sema);
+	mutex_lock(&file->f_mutex);
 	/* Get existing block (in case client is busy-waiting) */
 	block = nlmsvc_lookup_block(file, lock);
 	if (block == NULL) {
@@ -411,10 +392,10 @@ again:
 
 	/* If we don't have a block, create and initialize it. Then
 	 * retry because we may have slept in kmalloc. */
-	/* We have to release f_sema as nlmsvc_create_block may try to
+	/* We have to release f_mutex as nlmsvc_create_block may try to
 	 * to claim it while doing host garbage collection */
 	if (newblock == NULL) {
-		up(&file->f_sema);
+		mutex_unlock(&file->f_mutex);
 		dprintk("lockd: blocking on this lock (allocating).\n");
 		if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie)))
 			return nlm_lck_denied_nolocks;
@@ -424,7 +405,7 @@ again:
 	/* Append to list of blocked */
 	nlmsvc_insert_block(newblock, NLM_NEVER);
 out:
-	up(&file->f_sema);
+	mutex_unlock(&file->f_mutex);
 	nlmsvc_release_block(newblock);
 	nlmsvc_release_block(block);
 	dprintk("lockd: nlmsvc_lock returned %u\n", ret);
@@ -434,7 +415,7 @@ out:
 /*
  * Test for presence of a conflicting lock.
  */
-u32
+__be32
 nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
 				       struct nlm_lock *conflock)
 {
@@ -451,6 +432,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
 				(long long)conflock->fl.fl_start,
 				(long long)conflock->fl.fl_end);
 		conflock->caller = "somehost";	/* FIXME */
+		conflock->len = strlen(conflock->caller);
 		conflock->oh.len = 0;		/* don't return OH info */
 		conflock->svid = conflock->fl.fl_pid;
 		return nlm_lck_denied;
@@ -466,7 +448,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock,
  * afterwards. In this case the block will still be there, and hence
  * must be removed.
  */
-u32
+__be32
 nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
 {
 	int	error;
@@ -494,7 +476,7 @@ nlmsvc_unlock(struct nlm_file *file, struct nlm_lock *lock)
  * be in progress.
  * The calling procedure must check whether the file can be closed.
  */
-u32
+__be32
 nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 {
 	struct nlm_block	*block;
@@ -507,9 +489,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
-	down(&file->f_sema);
+	mutex_lock(&file->f_mutex);
 	block = nlmsvc_lookup_block(file, lock);
-	up(&file->f_sema);
+	mutex_unlock(&file->f_mutex);
 	if (block != NULL) {
 		status = nlmsvc_unlink_block(block);
 		nlmsvc_release_block(block);
@@ -527,10 +509,10 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
 static void
 nlmsvc_notify_blocked(struct file_lock *fl)
 {
-	struct nlm_block	**bp, *block;
+	struct nlm_block	*block;
 
 	dprintk("lockd: VFS unblock notification for block %p\n", fl);
-	for (bp = &nlm_blocked; (block = *bp) != 0; bp = &block->b_next) {
+	list_for_each_entry(block, &nlm_blocked, b_list) {
 		if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) {
 			nlmsvc_insert_block(block, 0);
 			svc_wake_up(block->b_daemon);
@@ -663,17 +645,14 @@ static const struct rpc_call_ops nlmsvc_grant_ops = {
  * block.
  */
 void
-nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status)
+nlmsvc_grant_reply(struct nlm_cookie *cookie, u32 status)
 {
 	struct nlm_block	*block;
-	struct nlm_file		*file;
 
-	dprintk("grant_reply: looking for cookie %x, host (%08x), s=%d \n", 
-		*(unsigned int *)(cookie->data), 
-		ntohl(rqstp->rq_addr.sin_addr.s_addr), status);
-	if (!(block = nlmsvc_find_block(cookie, &rqstp->rq_addr)))
+	dprintk("grant_reply: looking for cookie %x, s=%d \n",
+		*(unsigned int *)(cookie->data), status);
+	if (!(block = nlmsvc_find_block(cookie)))
 		return;
-	file = block->b_file;
 
 	if (block) {
 		if (status == NLM_LCK_DENIED_GRACE_PERIOD) {
@@ -696,16 +675,19 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status
 unsigned long
 nlmsvc_retry_blocked(void)
 {
-	struct nlm_block	*block;
+	unsigned long	timeout = MAX_SCHEDULE_TIMEOUT;
+	struct nlm_block *block;
+
+	while (!list_empty(&nlm_blocked)) {
+		block = list_entry(nlm_blocked.next, struct nlm_block, b_list);
 
-	dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
-			nlm_blocked,
-			nlm_blocked? nlm_blocked->b_when : 0);
-	while ((block = nlm_blocked) != 0) {
 		if (block->b_when == NLM_NEVER)
 			break;
-	        if (time_after(block->b_when,jiffies))
+	        if (time_after(block->b_when,jiffies)) {
+			timeout = block->b_when - jiffies;
 			break;
+		}
+
 		dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n",
 			block, block->b_when);
 		kref_get(&block->b_count);
@@ -713,8 +695,5 @@ nlmsvc_retry_blocked(void)
 		nlmsvc_release_block(block);
 	}
 
-	if ((block = nlm_blocked) && block->b_when != NLM_NEVER)
-		return (block->b_when - jiffies);
-
-	return MAX_SCHEDULE_TIMEOUT;
+	return timeout;
 }
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index dbb66a3b5cd9..32e99a6e8dca 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -22,8 +22,8 @@
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
 #ifdef CONFIG_LOCKD_V4
-static u32
-cast_to_nlm(u32 status, u32 vers)
+static __be32
+cast_to_nlm(__be32 status, u32 vers)
 {
 	/* Note: status is assumed to be in network byte order !!! */
 	if (vers != 4){
@@ -52,22 +52,22 @@ cast_to_nlm(u32 status, u32 vers)
 /*
  * Obtain client and file from arguments
  */
-static u32
+static __be32
 nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 			struct nlm_host **hostp, struct nlm_file **filp)
 {
 	struct nlm_host		*host = NULL;
 	struct nlm_file		*file = NULL;
 	struct nlm_lock		*lock = &argp->lock;
-	u32			error;
+	__be32			error = 0;
 
 	/* nfsd callbacks must have been installed for this procedure */
 	if (!nlmsvc_ops)
 		return nlm_lck_denied_nolocks;
 
 	/* Obtain host handle */
-	if (!(host = nlmsvc_lookup_host(rqstp))
-	 || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0))
+	if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
+	 || (argp->monitor && nsm_monitor(host) < 0))
 		goto no_locks;
 	*hostp = host;
 
@@ -88,13 +88,15 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 no_locks:
 	if (host)
 		nlm_release_host(host);
+	if (error)
+		return error;
 	return nlm_lck_denied_nolocks;
 }
 
 /*
  * NULL: Test for presence of service
  */
-static int
+static __be32
 nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	dprintk("lockd: NULL          called\n");
@@ -104,7 +106,7 @@ nlmsvc_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * TEST: Check for conflicting lock
  */
-static int
+static __be32
 nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 				         struct nlm_res  *resp)
 {
@@ -122,7 +124,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now check for conflicting locks */
 	resp->status = cast_status(nlmsvc_testlock(file, &argp->lock, &resp->lock));
@@ -134,7 +136,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return rpc_success;
 }
 
-static int
+static __be32
 nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 				         struct nlm_res  *resp)
 {
@@ -153,7 +155,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 #if 0
 	/* If supplied state doesn't match current state, we assume it's
@@ -177,7 +179,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return rpc_success;
 }
 
-static int
+static __be32
 nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 				           struct nlm_res  *resp)
 {
@@ -196,7 +198,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Try to cancel request. */
 	resp->status = cast_status(nlmsvc_cancel_blocked(file, &argp->lock));
@@ -210,7 +212,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * UNLOCK: release a lock
  */
-static int
+static __be32
 nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
 				           struct nlm_res  *resp)
 {
@@ -229,7 +231,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now try to remove the lock */
 	resp->status = cast_status(nlmsvc_unlock(file, &argp->lock));
@@ -244,7 +246,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
  * GRANTED: A server calls us to tell that a process' lock request
  * was granted
  */
-static int
+static __be32
 nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
 				            struct nlm_res  *resp)
 {
@@ -280,14 +282,16 @@ static const struct rpc_call_ops nlmsvc_callback_ops = {
  * because we send the callback before the reply proper. I hope this
  * doesn't break any clients.
  */
-static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
-		int (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res  *))
+static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *argp,
+		__be32 (*func)(struct svc_rqst *, struct nlm_args *, struct nlm_res  *))
 {
 	struct nlm_host	*host;
 	struct nlm_rqst	*call;
-	int stat;
+	__be32 stat;
 
-	host = nlmsvc_lookup_host(rqstp);
+	host = nlmsvc_lookup_host(rqstp,
+				  argp->lock.caller,
+				  argp->lock.len);
 	if (host == NULL)
 		return rpc_system_err;
 
@@ -307,28 +311,28 @@ static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *ar
 	return rpc_success;
 }
 
-static int nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlmsvc_proc_test_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					     void	     *resp)
 {
 	dprintk("lockd: TEST_MSG      called\n");
 	return nlmsvc_callback(rqstp, NLMPROC_TEST_RES, argp, nlmsvc_proc_test);
 }
 
-static int nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlmsvc_proc_lock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					     void	     *resp)
 {
 	dprintk("lockd: LOCK_MSG      called\n");
 	return nlmsvc_callback(rqstp, NLMPROC_LOCK_RES, argp, nlmsvc_proc_lock);
 }
 
-static int nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
+static __be32 nlmsvc_proc_cancel_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 					       void	       *resp)
 {
 	dprintk("lockd: CANCEL_MSG    called\n");
 	return nlmsvc_callback(rqstp, NLMPROC_CANCEL_RES, argp, nlmsvc_proc_cancel);
 }
 
-static int
+static __be32
 nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
                                                void            *resp)
 {
@@ -336,7 +340,7 @@ nlmsvc_proc_unlock_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 	return nlmsvc_callback(rqstp, NLMPROC_UNLOCK_RES, argp, nlmsvc_proc_unlock);
 }
 
-static int
+static __be32
 nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
                                                 void            *resp)
 {
@@ -347,7 +351,7 @@ nlmsvc_proc_granted_msg(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * SHARE: create a DOS share or alter existing share.
  */
-static int
+static __be32
 nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 				          struct nlm_res  *resp)
 {
@@ -366,7 +370,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now try to create the share */
 	resp->status = cast_status(nlmsvc_share_file(host, file, argp));
@@ -380,7 +384,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * UNSHARE: Release a DOS share.
  */
-static int
+static __be32
 nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 				            struct nlm_res  *resp)
 {
@@ -399,7 +403,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 	/* Obtain client and file */
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
-		return rpc_success;
+		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
 	/* Now try to unshare the file */
 	resp->status = cast_status(nlmsvc_unshare_file(host, file, argp));
@@ -413,7 +417,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * NM_LOCK: Create an unmonitored lock
  */
-static int
+static __be32
 nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 				            struct nlm_res  *resp)
 {
@@ -426,7 +430,7 @@ nlmsvc_proc_nm_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * FREE_ALL: Release all locks and shares held by client
  */
-static int
+static __be32
 nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
 					     void            *resp)
 {
@@ -444,14 +448,11 @@ nlmsvc_proc_free_all(struct svc_rqst *rqstp, struct nlm_args *argp,
 /*
  * SM_NOTIFY: private callback from statd (not part of official NLM proto)
  */
-static int
+static __be32
 nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 					      void	        *resp)
 {
 	struct sockaddr_in	saddr = rqstp->rq_addr;
-	int			vers = argp->vers;
-	int			prot = argp->proto >> 1;
-	struct nlm_host		*host;
 
 	dprintk("lockd: SM_NOTIFY     called\n");
 	if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
@@ -466,19 +467,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 	/* Obtain the host pointer for this NFS server and try to
 	 * reclaim all locks we hold on this server.
 	 */
+	memset(&saddr, 0, sizeof(saddr));
 	saddr.sin_addr.s_addr = argp->addr;
-	if ((argp->proto & 1)==0) {
-		if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) {
-			nlmclnt_recovery(host, argp->state);
-			nlm_release_host(host);
-		}
-	} else {
-		/* If we run on an NFS server, delete all locks held by the client */
-		if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) {
-			nlmsvc_free_host_resources(host);
-			nlm_release_host(host);
-		}
-	}
+	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
 
 	return rpc_success;
 }
@@ -486,7 +477,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 /*
  * client sent a GRANTED_RES, let's remove the associated block
  */
-static int
+static __be32
 nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res  *argp,
                                                 void            *resp)
 {
@@ -495,7 +486,7 @@ nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res  *argp,
 
 	dprintk("lockd: GRANTED_RES   called\n");
 
-	nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status);
+	nlmsvc_grant_reply(&argp->cookie, argp->status);
 	return rpc_success;
 }
 
diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c
index 27288c83da96..6220dc2a3f2c 100644
--- a/fs/lockd/svcshare.c
+++ b/fs/lockd/svcshare.c
@@ -23,7 +23,7 @@ nlm_cmp_owner(struct nlm_share *share, struct xdr_netobj *oh)
 	    && !memcmp(share->s_owner.data, oh->data, oh->len);
 }
 
-u32
+__be32
 nlmsvc_share_file(struct nlm_host *host, struct nlm_file *file,
 			struct nlm_args *argp)
 {
@@ -64,7 +64,7 @@ update:
 /*
  * Delete a share.
  */
-u32
+__be32
 nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
 			struct nlm_args *argp)
 {
@@ -85,24 +85,20 @@ nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file,
 }
 
 /*
- * Traverse all shares for a given file (and host).
- * NLM_ACT_CHECK is handled by nlmsvc_inspect_file.
+ * Traverse all shares for a given file, and delete
+ * those owned by the given (type of) host
  */
-void
-nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, int action)
+void nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file,
+		nlm_host_match_fn_t match)
 {
 	struct nlm_share	*share, **shpp;
 
 	shpp = &file->f_shares;
 	while ((share = *shpp) !=  NULL) {
-		if (action == NLM_ACT_MARK)
-			share->s_host->h_inuse = 1;
-		else if (action == NLM_ACT_UNLOCK) {
-			if (host == NULL || host == share->s_host) {
-				*shpp = share->s_next;
-				kfree(share);
-				continue;
-			}
+		if (match(share->s_host, host)) {
+			*shpp = share->s_next;
+			kfree(share);
+			continue;
 		}
 		shpp = &share->s_next;
 	}
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 01b4db9e5466..e83024e16042 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -25,9 +25,9 @@
 /*
  * Global file hash table
  */
-#define FILE_HASH_BITS		5
+#define FILE_HASH_BITS		7
 #define FILE_NRHASH		(1<<FILE_HASH_BITS)
-static struct nlm_file *	nlm_files[FILE_NRHASH];
+static struct hlist_head	nlm_files[FILE_NRHASH];
 static DEFINE_MUTEX(nlm_file_mutex);
 
 #ifdef NFSD_DEBUG
@@ -78,13 +78,14 @@ static inline unsigned int file_hash(struct nfs_fh *f)
  * This is not quite right, but for now, we assume the client performs
  * the proper R/W checking.
  */
-u32
+__be32
 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
 					struct nfs_fh *f)
 {
+	struct hlist_node *pos;
 	struct nlm_file	*file;
 	unsigned int	hash;
-	u32		nfserr;
+	__be32		nfserr;
 
 	nlm_debug_print_fh("nlm_file_lookup", f);
 
@@ -93,21 +94,21 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
 	/* Lock file table */
 	mutex_lock(&nlm_file_mutex);
 
-	for (file = nlm_files[hash]; file; file = file->f_next)
+	hlist_for_each_entry(file, pos, &nlm_files[hash], f_list)
 		if (!nfs_compare_fh(&file->f_handle, f))
 			goto found;
 
 	nlm_debug_print_fh("creating file for", f);
 
 	nfserr = nlm_lck_denied_nolocks;
-	file = (struct nlm_file *) kmalloc(sizeof(*file), GFP_KERNEL);
+	file = kzalloc(sizeof(*file), GFP_KERNEL);
 	if (!file)
 		goto out_unlock;
 
-	memset(file, 0, sizeof(*file));
 	memcpy(&file->f_handle, f, sizeof(struct nfs_fh));
-	file->f_hash = hash;
-	init_MUTEX(&file->f_sema);
+	mutex_init(&file->f_mutex);
+	INIT_HLIST_NODE(&file->f_list);
+	INIT_LIST_HEAD(&file->f_blocks);
 
 	/* Open the file. Note that this must not sleep for too long, else
 	 * we would lock up lockd:-) So no NFS re-exports, folks.
@@ -116,12 +117,11 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
 	 * the file.
 	 */
 	if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) {
-		dprintk("lockd: open failed (nfserr %d)\n", ntohl(nfserr));
+		dprintk("lockd: open failed (error %d)\n", nfserr);
 		goto out_free;
 	}
 
-	file->f_next = nlm_files[hash];
-	nlm_files[hash] = file;
+	hlist_add_head(&file->f_list, &nlm_files[hash]);
 
 found:
 	dprintk("lockd: found file %p (count %d)\n", file, file->f_count);
@@ -135,12 +135,6 @@ out_unlock:
 
 out_free:
 	kfree(file);
-#ifdef CONFIG_LOCKD_V4
-	if (nfserr == 1)
-		nfserr = nlm4_stale_fh;
-	else
-#endif
-	nfserr = nlm_lck_denied;
 	goto out_unlock;
 }
 
@@ -150,22 +144,14 @@ out_free:
 static inline void
 nlm_delete_file(struct nlm_file *file)
 {
-	struct nlm_file	**fp, *f;
-
 	nlm_debug_print_file("closing file", file);
-
-	fp = nlm_files + file->f_hash;
-	while ((f = *fp) != NULL) {
-		if (f == file) {
-			*fp = file->f_next;
-			nlmsvc_ops->fclose(file->f_file);
-			kfree(file);
-			return;
-		}
-		fp = &f->f_next;
+	if (!hlist_unhashed(&file->f_list)) {
+		hlist_del(&file->f_list);
+		nlmsvc_ops->fclose(file->f_file);
+		kfree(file);
+	} else {
+		printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
 	}
-
-	printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
 }
 
 /*
@@ -173,7 +159,8 @@ nlm_delete_file(struct nlm_file *file)
  * action.
  */
 static int
-nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, int action)
+nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
+			nlm_host_match_fn_t match)
 {
 	struct inode	 *inode = nlmsvc_file_inode(file);
 	struct file_lock *fl;
@@ -187,17 +174,11 @@ again:
 
 		/* update current lock count */
 		file->f_locks++;
+
 		lockhost = (struct nlm_host *) fl->fl_owner;
-		if (action == NLM_ACT_MARK)
-			lockhost->h_inuse = 1;
-		else if (action == NLM_ACT_CHECK)
-			return 1;
-		else if (action == NLM_ACT_UNLOCK) {
+		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
-			if (host && lockhost != host)
-				continue;
-
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
@@ -214,53 +195,66 @@ again:
 }
 
 /*
- * Operate on a single file
+ * Inspect a single file
+ */
+static inline int
+nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match)
+{
+	nlmsvc_traverse_blocks(host, file, match);
+	nlmsvc_traverse_shares(host, file, match);
+	return nlm_traverse_locks(host, file, match);
+}
+
+/*
+ * Quick check whether there are still any locks, blocks or
+ * shares on a given file.
  */
 static inline int
-nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, int action)
+nlm_file_inuse(struct nlm_file *file)
 {
-	if (action == NLM_ACT_CHECK) {
-		/* Fast path for mark and sweep garbage collection */
-		if (file->f_count || file->f_blocks || file->f_shares)
+	struct inode	 *inode = nlmsvc_file_inode(file);
+	struct file_lock *fl;
+
+	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
+		return 1;
+
+	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
+		if (fl->fl_lmops == &nlmsvc_lock_operations)
 			return 1;
-	} else {
-		nlmsvc_traverse_blocks(host, file, action);
-		nlmsvc_traverse_shares(host, file, action);
 	}
-	return nlm_traverse_locks(host, file, action);
+	file->f_locks = 0;
+	return 0;
 }
 
 /*
  * Loop over all files in the file table.
  */
 static int
-nlm_traverse_files(struct nlm_host *host, int action)
+nlm_traverse_files(struct nlm_host *host, nlm_host_match_fn_t match)
 {
-	struct nlm_file	*file, **fp;
+	struct hlist_node *pos, *next;
+	struct nlm_file	*file;
 	int i, ret = 0;
 
 	mutex_lock(&nlm_file_mutex);
 	for (i = 0; i < FILE_NRHASH; i++) {
-		fp = nlm_files + i;
-		while ((file = *fp) != NULL) {
+		hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) {
 			file->f_count++;
 			mutex_unlock(&nlm_file_mutex);
 
 			/* Traverse locks, blocks and shares of this file
 			 * and update file->f_locks count */
-			if (nlm_inspect_file(host, file, action))
+			if (nlm_inspect_file(host, file, match))
 				ret = 1;
 
 			mutex_lock(&nlm_file_mutex);
 			file->f_count--;
 			/* No more references to this file. Let go of it. */
-			if (!file->f_blocks && !file->f_locks
+			if (list_empty(&file->f_blocks) && !file->f_locks
 			 && !file->f_shares && !file->f_count) {
-				*fp = file->f_next;
+				hlist_del(&file->f_list);
 				nlmsvc_ops->fclose(file->f_file);
 				kfree(file);
-			} else {
-				fp = &file->f_next;
 			}
 		}
 	}
@@ -287,23 +281,63 @@ nlm_release_file(struct nlm_file *file)
 	mutex_lock(&nlm_file_mutex);
 
 	/* If there are no more locks etc, delete the file */
-	if(--file->f_count == 0) {
-		if(!nlm_inspect_file(NULL, file, NLM_ACT_CHECK))
-			nlm_delete_file(file);
-	}
+	if (--file->f_count == 0 && !nlm_file_inuse(file))
+		nlm_delete_file(file);
 
 	mutex_unlock(&nlm_file_mutex);
 }
 
 /*
+ * Helpers function for resource traversal
+ *
+ * nlmsvc_mark_host:
+ *	used by the garbage collector; simply sets h_inuse.
+ *	Always returns 0.
+ *
+ * nlmsvc_same_host:
+ *	returns 1 iff the two hosts match. Used to release
+ *	all resources bound to a specific host.
+ *
+ * nlmsvc_is_client:
+ *	returns 1 iff the host is a client.
+ *	Used by nlmsvc_invalidate_all
+ */
+static int
+nlmsvc_mark_host(struct nlm_host *host, struct nlm_host *dummy)
+{
+	host->h_inuse = 1;
+	return 0;
+}
+
+static int
+nlmsvc_same_host(struct nlm_host *host, struct nlm_host *other)
+{
+	return host == other;
+}
+
+static int
+nlmsvc_is_client(struct nlm_host *host, struct nlm_host *dummy)
+{
+	if (host->h_server) {
+		/* we are destroying locks even though the client
+		 * hasn't asked us too, so don't unmonitor the
+		 * client
+		 */
+		if (host->h_nsmhandle)
+			host->h_nsmhandle->sm_sticky = 1;
+		return 1;
+	} else
+		return 0;
+}
+
+/*
  * Mark all hosts that still hold resources
  */
 void
 nlmsvc_mark_resources(void)
 {
 	dprintk("lockd: nlmsvc_mark_resources\n");
-
-	nlm_traverse_files(NULL, NLM_ACT_MARK);
+	nlm_traverse_files(NULL, nlmsvc_mark_host);
 }
 
 /*
@@ -314,23 +348,25 @@ nlmsvc_free_host_resources(struct nlm_host *host)
 {
 	dprintk("lockd: nlmsvc_free_host_resources\n");
 
-	if (nlm_traverse_files(host, NLM_ACT_UNLOCK))
+	if (nlm_traverse_files(host, nlmsvc_same_host)) {
 		printk(KERN_WARNING
-			"lockd: couldn't remove all locks held by %s",
+			"lockd: couldn't remove all locks held by %s\n",
 			host->h_name);
+		BUG();
+	}
 }
 
 /*
- * delete all hosts structs for clients
+ * Remove all locks held for clients
  */
 void
 nlmsvc_invalidate_all(void)
 {
-	struct nlm_host *host;
-	while ((host = nlm_find_client()) != NULL) {
-		nlmsvc_free_host_resources(host);
-		host->h_expires = 0;
-		host->h_killed = 1;
-		nlm_release_host(host);
-	}
+	/* Release all locks held by NFS clients.
+	 * Previously, the code would call
+	 * nlmsvc_free_host_resources for each client in
+	 * turn, which is about as inefficient as it gets.
+	 * Now we just do it once in nlm_traverse_files.
+	 */
+	nlm_traverse_files(NULL, nlmsvc_is_client);
 }
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 033ea4ac2c30..b7c949256e5a 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -43,7 +43,7 @@ loff_t_to_s32(loff_t offset)
 /*
  * XDR functions for basic NLM types
  */
-static u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c)
+static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
 {
 	unsigned int	len;
 
@@ -69,8 +69,8 @@ static u32 *nlm_decode_cookie(u32 *p, struct nlm_cookie *c)
 	return p;
 }
 
-static inline u32 *
-nlm_encode_cookie(u32 *p, struct nlm_cookie *c)
+static inline __be32 *
+nlm_encode_cookie(__be32 *p, struct nlm_cookie *c)
 {
 	*p++ = htonl(c->len);
 	memcpy(p, c->data, c->len);
@@ -78,8 +78,8 @@ nlm_encode_cookie(u32 *p, struct nlm_cookie *c)
 	return p;
 }
 
-static u32 *
-nlm_decode_fh(u32 *p, struct nfs_fh *f)
+static __be32 *
+nlm_decode_fh(__be32 *p, struct nfs_fh *f)
 {
 	unsigned int	len;
 
@@ -95,8 +95,8 @@ nlm_decode_fh(u32 *p, struct nfs_fh *f)
 	return p + XDR_QUADLEN(NFS2_FHSIZE);
 }
 
-static inline u32 *
-nlm_encode_fh(u32 *p, struct nfs_fh *f)
+static inline __be32 *
+nlm_encode_fh(__be32 *p, struct nfs_fh *f)
 {
 	*p++ = htonl(NFS2_FHSIZE);
 	memcpy(p, f->data, NFS2_FHSIZE);
@@ -106,20 +106,20 @@ nlm_encode_fh(u32 *p, struct nfs_fh *f)
 /*
  * Encode and decode owner handle
  */
-static inline u32 *
-nlm_decode_oh(u32 *p, struct xdr_netobj *oh)
+static inline __be32 *
+nlm_decode_oh(__be32 *p, struct xdr_netobj *oh)
 {
 	return xdr_decode_netobj(p, oh);
 }
 
-static inline u32 *
-nlm_encode_oh(u32 *p, struct xdr_netobj *oh)
+static inline __be32 *
+nlm_encode_oh(__be32 *p, struct xdr_netobj *oh)
 {
 	return xdr_encode_netobj(p, oh);
 }
 
-static u32 *
-nlm_decode_lock(u32 *p, struct nlm_lock *lock)
+static __be32 *
+nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
 {
 	struct file_lock	*fl = &lock->fl;
 	s32			start, len, end;
@@ -153,8 +153,8 @@ nlm_decode_lock(u32 *p, struct nlm_lock *lock)
 /*
  * Encode a lock as part of an NLM call
  */
-static u32 *
-nlm_encode_lock(u32 *p, struct nlm_lock *lock)
+static __be32 *
+nlm_encode_lock(__be32 *p, struct nlm_lock *lock)
 {
 	struct file_lock	*fl = &lock->fl;
 	__s32			start, len;
@@ -184,8 +184,8 @@ nlm_encode_lock(u32 *p, struct nlm_lock *lock)
 /*
  * Encode result of a TEST/TEST_MSG call
  */
-static u32 *
-nlm_encode_testres(u32 *p, struct nlm_res *resp)
+static __be32 *
+nlm_encode_testres(__be32 *p, struct nlm_res *resp)
 {
 	s32		start, len;
 
@@ -221,7 +221,7 @@ nlm_encode_testres(u32 *p, struct nlm_res *resp)
  * First, the server side XDR functions
  */
 int
-nlmsvc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	u32	exclusive;
 
@@ -238,7 +238,7 @@ nlmsvc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlmsvc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm_encode_testres(p, resp)))
 		return 0;
@@ -246,7 +246,7 @@ nlmsvc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
 }
 
 int
-nlmsvc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	u32	exclusive;
 
@@ -266,7 +266,7 @@ nlmsvc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlmsvc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	u32	exclusive;
 
@@ -282,7 +282,7 @@ nlmsvc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	if (!(p = nlm_decode_cookie(p, &argp->cookie))
 	 || !(p = nlm_decode_lock(p, &argp->lock)))
@@ -292,7 +292,7 @@ nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlmsvc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -313,7 +313,7 @@ nlmsvc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlmsvc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm_encode_cookie(p, &resp->cookie)))
 		return 0;
@@ -323,7 +323,7 @@ nlmsvc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
 }
 
 int
-nlmsvc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm_encode_cookie(p, &resp->cookie)))
 		return 0;
@@ -332,7 +332,7 @@ nlmsvc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
 }
 
 int
-nlmsvc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
+nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -344,7 +344,7 @@ nlmsvc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
 }
 
 int
-nlmsvc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
+nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
 {
 	if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
 		return 0;
@@ -357,7 +357,7 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
 }
 
 int
-nlmsvc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm_decode_cookie(p, &resp->cookie)))
 		return 0;
@@ -366,13 +366,13 @@ nlmsvc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
 }
 
 int
-nlmsvc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nlmsvc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
@@ -389,7 +389,7 @@ nlmclt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
 #endif
 
 static int
-nlmclt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlmclt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -403,7 +403,7 @@ nlmclt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
 }
 
 static int
-nlmclt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlmclt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm_decode_cookie(p, &resp->cookie)))
 		return -EIO;
@@ -438,7 +438,7 @@ nlmclt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 
 
 static int
-nlmclt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlmclt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -455,7 +455,7 @@ nlmclt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
 }
 
 static int
-nlmclt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlmclt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -470,7 +470,7 @@ nlmclt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
 }
 
 static int
-nlmclt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlmclt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -483,7 +483,7 @@ nlmclt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
 }
 
 static int
-nlmclt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlmclt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm_encode_cookie(p, &resp->cookie)))
 		return -EIO;
@@ -493,7 +493,7 @@ nlmclt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 }
 
 static int
-nlmclt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlmclt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm_encode_testres(p, resp)))
 		return -EIO;
@@ -502,7 +502,7 @@ nlmclt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 }
 
 static int
-nlmclt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlmclt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm_decode_cookie(p, &resp->cookie)))
 		return -EIO;
@@ -515,7 +515,7 @@ nlmclt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
  */
 #define NLM_void_sz		0
 #define NLM_cookie_sz		1+XDR_QUADLEN(NLM_MAXCOOKIELEN)
-#define NLM_caller_sz		1+XDR_QUADLEN(sizeof(system_utsname.nodename))
+#define NLM_caller_sz		1+XDR_QUADLEN(sizeof(utsname()->nodename))
 #define NLM_netobj_sz		1+XDR_QUADLEN(XDR_MAX_NETOBJ)
 /* #define NLM_owner_sz		1+XDR_QUADLEN(NLM_MAXOWNER) */
 #define NLM_fhandle_sz		1+XDR_QUADLEN(NFS2_FHSIZE)
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 36eb175ec335..f4c0b2b9f75a 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -44,8 +44,8 @@ loff_t_to_s64(loff_t offset)
 /*
  * XDR functions for basic NLM types
  */
-static u32 *
-nlm4_decode_cookie(u32 *p, struct nlm_cookie *c)
+static __be32 *
+nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
 {
 	unsigned int	len;
 
@@ -71,8 +71,8 @@ nlm4_decode_cookie(u32 *p, struct nlm_cookie *c)
 	return p;
 }
 
-static u32 *
-nlm4_encode_cookie(u32 *p, struct nlm_cookie *c)
+static __be32 *
+nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c)
 {
 	*p++ = htonl(c->len);
 	memcpy(p, c->data, c->len);
@@ -80,8 +80,8 @@ nlm4_encode_cookie(u32 *p, struct nlm_cookie *c)
 	return p;
 }
 
-static u32 *
-nlm4_decode_fh(u32 *p, struct nfs_fh *f)
+static __be32 *
+nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
 {
 	memset(f->data, 0, sizeof(f->data));
 	f->size = ntohl(*p++);
@@ -95,8 +95,8 @@ nlm4_decode_fh(u32 *p, struct nfs_fh *f)
 	return p + XDR_QUADLEN(f->size);
 }
 
-static u32 *
-nlm4_encode_fh(u32 *p, struct nfs_fh *f)
+static __be32 *
+nlm4_encode_fh(__be32 *p, struct nfs_fh *f)
 {
 	*p++ = htonl(f->size);
 	if (f->size) p[XDR_QUADLEN(f->size)-1] = 0; /* don't leak anything */
@@ -107,20 +107,20 @@ nlm4_encode_fh(u32 *p, struct nfs_fh *f)
 /*
  * Encode and decode owner handle
  */
-static u32 *
-nlm4_decode_oh(u32 *p, struct xdr_netobj *oh)
+static __be32 *
+nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
 {
 	return xdr_decode_netobj(p, oh);
 }
 
-static u32 *
-nlm4_encode_oh(u32 *p, struct xdr_netobj *oh)
+static __be32 *
+nlm4_encode_oh(__be32 *p, struct xdr_netobj *oh)
 {
 	return xdr_encode_netobj(p, oh);
 }
 
-static u32 *
-nlm4_decode_lock(u32 *p, struct nlm_lock *lock)
+static __be32 *
+nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
 {
 	struct file_lock	*fl = &lock->fl;
 	__s64			len, start, end;
@@ -153,8 +153,8 @@ nlm4_decode_lock(u32 *p, struct nlm_lock *lock)
 /*
  * Encode a lock as part of an NLM call
  */
-static u32 *
-nlm4_encode_lock(u32 *p, struct nlm_lock *lock)
+static __be32 *
+nlm4_encode_lock(__be32 *p, struct nlm_lock *lock)
 {
 	struct file_lock	*fl = &lock->fl;
 	__s64			start, len;
@@ -185,8 +185,8 @@ nlm4_encode_lock(u32 *p, struct nlm_lock *lock)
 /*
  * Encode result of a TEST/TEST_MSG call
  */
-static u32 *
-nlm4_encode_testres(u32 *p, struct nlm_res *resp)
+static __be32 *
+nlm4_encode_testres(__be32 *p, struct nlm_res *resp)
 {
 	s64		start, len;
 
@@ -227,7 +227,7 @@ nlm4_encode_testres(u32 *p, struct nlm_res *resp)
  * First, the server side XDR functions
  */
 int
-nlm4svc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	u32	exclusive;
 
@@ -244,7 +244,7 @@ nlm4svc_decode_testargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlm4svc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm4_encode_testres(p, resp)))
 		return 0;
@@ -252,7 +252,7 @@ nlm4svc_encode_testres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
 }
 
 int
-nlm4svc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	u32	exclusive;
 
@@ -272,7 +272,7 @@ nlm4svc_decode_lockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlm4svc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	u32	exclusive;
 
@@ -288,7 +288,7 @@ nlm4svc_decode_cancargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	if (!(p = nlm4_decode_cookie(p, &argp->cookie))
 	 || !(p = nlm4_decode_lock(p, &argp->lock)))
@@ -298,7 +298,7 @@ nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlm4svc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
+nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -319,7 +319,7 @@ nlm4svc_decode_shareargs(struct svc_rqst *rqstp, u32 *p, nlm_args *argp)
 }
 
 int
-nlm4svc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
 		return 0;
@@ -329,7 +329,7 @@ nlm4svc_encode_shareres(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
 }
 
 int
-nlm4svc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
 		return 0;
@@ -338,7 +338,7 @@ nlm4svc_encode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
 }
 
 int
-nlm4svc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
+nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p, struct nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -350,7 +350,7 @@ nlm4svc_decode_notify(struct svc_rqst *rqstp, u32 *p, struct nlm_args *argp)
 }
 
 int
-nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
+nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
 {
 	if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
 		return 0;
@@ -363,7 +363,7 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp)
 }
 
 int
-nlm4svc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
+nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
 		return 0;
@@ -372,13 +372,13 @@ nlm4svc_decode_res(struct svc_rqst *rqstp, u32 *p, struct nlm_res *resp)
 }
 
 int
-nlm4svc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nlm4svc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
@@ -388,14 +388,14 @@ nlm4svc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
  */
 #ifdef NLMCLNT_SUPPORT_SHARES
 static int
-nlm4clt_decode_void(struct rpc_rqst *req, u32 *p, void *ptr)
+nlm4clt_decode_void(struct rpc_rqst *req, __be32 *p, void *ptr)
 {
 	return 0;
 }
 #endif
 
 static int
-nlm4clt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlm4clt_encode_testargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -409,7 +409,7 @@ nlm4clt_encode_testargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
 }
 
 static int
-nlm4clt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlm4clt_decode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
 		return -EIO;
@@ -444,7 +444,7 @@ nlm4clt_decode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 
 
 static int
-nlm4clt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlm4clt_encode_lockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -461,7 +461,7 @@ nlm4clt_encode_lockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
 }
 
 static int
-nlm4clt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlm4clt_encode_cancargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -476,7 +476,7 @@ nlm4clt_encode_cancargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
 }
 
 static int
-nlm4clt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
+nlm4clt_encode_unlockargs(struct rpc_rqst *req, __be32 *p, nlm_args *argp)
 {
 	struct nlm_lock	*lock = &argp->lock;
 
@@ -489,7 +489,7 @@ nlm4clt_encode_unlockargs(struct rpc_rqst *req, u32 *p, nlm_args *argp)
 }
 
 static int
-nlm4clt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlm4clt_encode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
 		return -EIO;
@@ -499,7 +499,7 @@ nlm4clt_encode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 }
 
 static int
-nlm4clt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlm4clt_encode_testres(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm4_encode_testres(p, resp)))
 		return -EIO;
@@ -508,7 +508,7 @@ nlm4clt_encode_testres(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
 }
 
 static int
-nlm4clt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp)
+nlm4clt_decode_res(struct rpc_rqst *req, __be32 *p, struct nlm_res *resp)
 {
 	if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
 		return -EIO;
diff --git a/fs/locks.c b/fs/locks.c
index d7c53392cac1..e0b6a80649a0 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -314,13 +314,13 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
 	off_t start, end;
 
 	switch (l->l_whence) {
-	case 0: /*SEEK_SET*/
+	case SEEK_SET:
 		start = 0;
 		break;
-	case 1: /*SEEK_CUR*/
+	case SEEK_CUR:
 		start = filp->f_pos;
 		break;
-	case 2: /*SEEK_END*/
+	case SEEK_END:
 		start = i_size_read(filp->f_dentry->d_inode);
 		break;
 	default:
@@ -364,13 +364,13 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
 	loff_t start;
 
 	switch (l->l_whence) {
-	case 0: /*SEEK_SET*/
+	case SEEK_SET:
 		start = 0;
 		break;
-	case 1: /*SEEK_CUR*/
+	case SEEK_CUR:
 		start = filp->f_pos;
 		break;
-	case 2: /*SEEK_END*/
+	case SEEK_END:
 		start = i_size_read(filp->f_dentry->d_inode);
 		break;
 	default:
@@ -1514,7 +1514,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 		goto out_unlock;
 	}
 
-	error = f_setown(filp, current->pid, 0);
+	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 out_unlock:
 	unlock_kernel();
 	return error;
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e4fde1ab22cd..0ff71256e65b 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -160,6 +160,7 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
 
 static void
 __mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
+	__releases(mb_cache_spinlock)
 {
 	/* Wake up all processes queuing for this cache entry. */
 	if (ce->e_queued)
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 4a6abc49418e..df6b1075b549 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -254,7 +254,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
 	inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current->fsgid;
 	inode->i_ino = j;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 420b32882a10..40eac2e60d25 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -17,8 +17,10 @@ int minix_sync_file(struct file *, struct dentry *, int);
 
 const struct file_operations minix_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= minix_sync_file,
 	.sendfile	= generic_file_sendfile,
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 330ff9fc7cf0..1e36bae4d0eb 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -90,8 +90,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(minix_inode_cachep))
-		printk(KERN_INFO "minix_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(minix_inode_cachep);
 }
 
 static struct super_operations minix_sops = {
@@ -145,18 +144,13 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
 	struct inode *root_inode;
 	struct minix_sb_info *sbi;
 
-	sbi = kmalloc(sizeof(struct minix_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct minix_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 	s->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(struct minix_sb_info));
 
-	/* N.B. These should be compile-time tests.
-	   Unfortunately that is impossible. */
-	if (32 != sizeof (struct minix_inode))
-		panic("bad V1 i-node size");
-	if (64 != sizeof(struct minix2_inode))
-		panic("bad V2 i-node size");
+	BUILD_BUG_ON(32 != sizeof (struct minix_inode));
+	BUILD_BUG_ON(64 != sizeof(struct minix2_inode));
 
 	if (!sb_set_blocksize(s, BLOCK_SIZE))
 		goto out_bad_hblock;
@@ -207,10 +201,9 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
 	if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
 		goto out_illegal_sb;
 	i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
-	map = kmalloc(i, GFP_KERNEL);
+	map = kzalloc(i, GFP_KERNEL);
 	if (!map)
 		goto out_no_map;
-	memset(map, 0, i);
 	sbi->s_imap = &map[0];
 	sbi->s_zmap = &map[sbi->s_imap_blocks];
 
@@ -399,7 +392,7 @@ static void V1_minix_read_inode(struct inode * inode)
 	inode->i_mtime.tv_nsec = 0;
 	inode->i_atime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	for (i = 0; i < 9; i++)
 		minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
 	minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
@@ -432,7 +425,7 @@ static void V2_minix_read_inode(struct inode * inode)
 	inode->i_mtime.tv_nsec = 0;
 	inode->i_atime.tv_nsec = 0;
 	inode->i_ctime.tv_nsec = 0;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	for (i = 0; i < 10; i++)
 		minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
 	minix_set_inode(inode, old_decode_dev(raw_inode->i_zone[0]));
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 5b6a4540a05b..299bb66e3bde 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -249,7 +249,7 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
 		minix_set_link(new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		inode_dec_link_count(new_inode);
 	} else {
 		if (dir_de) {
diff --git a/fs/mpage.c b/fs/mpage.c
index 1e4598247d0b..692a3e578fc8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -693,6 +693,8 @@ out:
  * the call was made get new I/O started against them.  If wbc->sync_mode is
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
+ *
+ * If you fix this you should check generic_writepages() also!
  */
 int
 mpage_writepages(struct address_space *mapping,
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 9e44158a7540..b0f01b3b0536 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -280,7 +280,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
 			struct nameidata *nd)
 {
 	struct super_block *sb = dir->i_sb;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	struct fat_slot_info sinfo;
 	struct timespec ts;
 	unsigned char msdos_name[MSDOS_NAME];
@@ -316,6 +316,8 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
 	d_instantiate(dentry, inode);
 out:
 	unlock_kernel();
+	if (!err)
+		err = fat_flush_inodes(sb, dir, inode);
 	return err;
 }
 
@@ -341,13 +343,15 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
 	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
 	if (err)
 		goto out;
-	dir->i_nlink--;
+	drop_nlink(dir);
 
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	fat_detach(inode);
 out:
 	unlock_kernel();
+	if (!err)
+		err = fat_flush_inodes(inode->i_sb, dir, inode);
 
 	return err;
 }
@@ -385,7 +389,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo);
 	if (err)
 		goto out_free;
-	dir->i_nlink++;
+	inc_nlink(dir);
 
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
@@ -401,6 +405,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	d_instantiate(dentry, inode);
 
 	unlock_kernel();
+	fat_flush_inodes(sb, dir, inode);
 	return 0;
 
 out_free:
@@ -425,11 +430,13 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
 	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
 	if (err)
 		goto out;
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	inode->i_ctime = CURRENT_TIME_SEC;
 	fat_detach(inode);
 out:
 	unlock_kernel();
+	if (!err)
+		err = fat_flush_inodes(inode->i_sb, dir, inode);
 
 	return err;
 }
@@ -542,9 +549,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 			if (err)
 				goto error_dotdot;
 		}
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		if (!new_inode)
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 	}
 
 	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
@@ -559,10 +566,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 		mark_inode_dirty(old_dir);
 
 	if (new_inode) {
+		drop_nlink(new_inode);
 		if (is_dir)
-			new_inode->i_nlink -= 2;
-		else
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		new_inode->i_ctime = ts;
 	}
 out:
@@ -635,6 +641,8 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
 			      new_dir, new_msdos_name, new_dentry, is_hid);
 out:
 	unlock_kernel();
+	if (!err)
+		err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
 	return err;
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index 6b591c01b09f..28d49b301d55 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -372,6 +372,30 @@ void release_open_intent(struct nameidata *nd)
 		fput(nd->intent.open.file);
 }
 
+static inline struct dentry *
+do_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	int status = dentry->d_op->d_revalidate(dentry, nd);
+	if (unlikely(status <= 0)) {
+		/*
+		 * The dentry failed validation.
+		 * If d_revalidate returned 0 attempt to invalidate
+		 * the dentry otherwise d_revalidate is asking us
+		 * to return a fail status.
+		 */
+		if (!status) {
+			if (!d_invalidate(dentry)) {
+				dput(dentry);
+				dentry = NULL;
+			}
+		} else {
+			dput(dentry);
+			dentry = ERR_PTR(status);
+		}
+	}
+	return dentry;
+}
+
 /*
  * Internal lookup() using the new generic dcache.
  * SMP-safe
@@ -386,12 +410,9 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
 	if (!dentry)
 		dentry = d_lookup(parent, name);
 
-	if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
-		if (!dentry->d_op->d_revalidate(dentry, nd) && !d_invalidate(dentry)) {
-			dput(dentry);
-			dentry = NULL;
-		}
-	}
+	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
+		dentry = do_revalidate(dentry, nd);
+
 	return dentry;
 }
 
@@ -484,10 +505,9 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
 	 */
 	mutex_unlock(&dir->i_mutex);
 	if (result->d_op && result->d_op->d_revalidate) {
-		if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
-			dput(result);
+		result = do_revalidate(result, nd);
+		if (!result)
 			result = ERR_PTR(-ENOENT);
-		}
 	}
 	return result;
 }
@@ -498,18 +518,20 @@ static int __emul_lookup_dentry(const char *, struct nameidata *);
 static __always_inline int
 walk_init_root(const char *name, struct nameidata *nd)
 {
-	read_lock(&current->fs->lock);
-	if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
-		nd->mnt = mntget(current->fs->altrootmnt);
-		nd->dentry = dget(current->fs->altroot);
-		read_unlock(&current->fs->lock);
+	struct fs_struct *fs = current->fs;
+
+	read_lock(&fs->lock);
+	if (fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
+		nd->mnt = mntget(fs->altrootmnt);
+		nd->dentry = dget(fs->altroot);
+		read_unlock(&fs->lock);
 		if (__emul_lookup_dentry(name,nd))
 			return 0;
-		read_lock(&current->fs->lock);
+		read_lock(&fs->lock);
 	}
-	nd->mnt = mntget(current->fs->rootmnt);
-	nd->dentry = dget(current->fs->root);
-	read_unlock(&current->fs->lock);
+	nd->mnt = mntget(fs->rootmnt);
+	nd->dentry = dget(fs->root);
+	read_unlock(&fs->lock);
 	return 1;
 }
 
@@ -704,17 +726,19 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
 
 static __always_inline void follow_dotdot(struct nameidata *nd)
 {
+	struct fs_struct *fs = current->fs;
+
 	while(1) {
 		struct vfsmount *parent;
 		struct dentry *old = nd->dentry;
 
-                read_lock(&current->fs->lock);
-		if (nd->dentry == current->fs->root &&
-		    nd->mnt == current->fs->rootmnt) {
-                        read_unlock(&current->fs->lock);
+                read_lock(&fs->lock);
+		if (nd->dentry == fs->root &&
+		    nd->mnt == fs->rootmnt) {
+                        read_unlock(&fs->lock);
 			break;
 		}
-                read_unlock(&current->fs->lock);
+                read_unlock(&fs->lock);
 		spin_lock(&dcache_lock);
 		if (nd->dentry != nd->mnt->mnt_root) {
 			nd->dentry = dget(nd->dentry->d_parent);
@@ -767,12 +791,12 @@ need_lookup:
 	goto done;
 
 need_revalidate:
-	if (dentry->d_op->d_revalidate(dentry, nd))
-		goto done;
-	if (d_invalidate(dentry))
-		goto done;
-	dput(dentry);
-	goto need_lookup;
+	dentry = do_revalidate(dentry, nd);
+	if (!dentry)
+		goto need_lookup;
+	if (IS_ERR(dentry))
+		goto fail;
+	goto done;
 
 fail:
 	return PTR_ERR(dentry);
@@ -1022,15 +1046,17 @@ static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
 		struct vfsmount *old_mnt = nd->mnt;
 		struct qstr last = nd->last;
 		int last_type = nd->last_type;
+		struct fs_struct *fs = current->fs;
+
 		/*
-		 * NAME was not found in alternate root or it's a directory.  Try to find
-		 * it in the normal root:
+		 * NAME was not found in alternate root or it's a directory.
+		 * Try to find it in the normal root:
 		 */
 		nd->last_type = LAST_ROOT;
-		read_lock(&current->fs->lock);
-		nd->mnt = mntget(current->fs->rootmnt);
-		nd->dentry = dget(current->fs->root);
-		read_unlock(&current->fs->lock);
+		read_lock(&fs->lock);
+		nd->mnt = mntget(fs->rootmnt);
+		nd->dentry = dget(fs->root);
+		read_unlock(&fs->lock);
 		if (path_walk(name, nd) == 0) {
 			if (nd->dentry->d_inode) {
 				dput(old_dentry);
@@ -1054,6 +1080,7 @@ void set_fs_altroot(void)
 	struct vfsmount *mnt = NULL, *oldmnt;
 	struct dentry *dentry = NULL, *olddentry;
 	int err;
+	struct fs_struct *fs = current->fs;
 
 	if (!emul)
 		goto set_it;
@@ -1063,12 +1090,12 @@ void set_fs_altroot(void)
 		dentry = nd.dentry;
 	}
 set_it:
-	write_lock(&current->fs->lock);
-	oldmnt = current->fs->altrootmnt;
-	olddentry = current->fs->altroot;
-	current->fs->altrootmnt = mnt;
-	current->fs->altroot = dentry;
-	write_unlock(&current->fs->lock);
+	write_lock(&fs->lock);
+	oldmnt = fs->altrootmnt;
+	olddentry = fs->altroot;
+	fs->altrootmnt = mnt;
+	fs->altroot = dentry;
+	write_unlock(&fs->lock);
 	if (olddentry) {
 		dput(olddentry);
 		mntput(oldmnt);
@@ -1082,29 +1109,30 @@ static int fastcall do_path_lookup(int dfd, const char *name,
 	int retval = 0;
 	int fput_needed;
 	struct file *file;
+	struct fs_struct *fs = current->fs;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
 	nd->depth = 0;
 
 	if (*name=='/') {
-		read_lock(&current->fs->lock);
-		if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
-			nd->mnt = mntget(current->fs->altrootmnt);
-			nd->dentry = dget(current->fs->altroot);
-			read_unlock(&current->fs->lock);
+		read_lock(&fs->lock);
+		if (fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
+			nd->mnt = mntget(fs->altrootmnt);
+			nd->dentry = dget(fs->altroot);
+			read_unlock(&fs->lock);
 			if (__emul_lookup_dentry(name,nd))
 				goto out; /* found in altroot */
-			read_lock(&current->fs->lock);
+			read_lock(&fs->lock);
 		}
-		nd->mnt = mntget(current->fs->rootmnt);
-		nd->dentry = dget(current->fs->root);
-		read_unlock(&current->fs->lock);
+		nd->mnt = mntget(fs->rootmnt);
+		nd->dentry = dget(fs->root);
+		read_unlock(&fs->lock);
 	} else if (dfd == AT_FDCWD) {
-		read_lock(&current->fs->lock);
-		nd->mnt = mntget(current->fs->pwdmnt);
-		nd->dentry = dget(current->fs->pwd);
-		read_unlock(&current->fs->lock);
+		read_lock(&fs->lock);
+		nd->mnt = mntget(fs->pwdmnt);
+		nd->dentry = dget(fs->pwd);
+		read_unlock(&fs->lock);
 	} else {
 		struct dentry *dentry;
 
@@ -1567,6 +1595,24 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 	return 0;
 }
 
+static int open_namei_create(struct nameidata *nd, struct path *path,
+				int flag, int mode)
+{
+	int error;
+	struct dentry *dir = nd->dentry;
+
+	if (!IS_POSIXACL(dir->d_inode))
+		mode &= ~current->fs->umask;
+	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
+	mutex_unlock(&dir->d_inode->i_mutex);
+	dput(nd->dentry);
+	nd->dentry = path->dentry;
+	if (error)
+		return error;
+	/* Don't check for write permission, don't truncate */
+	return may_open(nd, 0, flag & ~O_TRUNC);
+}
+
 /*
  *	open_namei()
  *
@@ -1648,18 +1694,10 @@ do_last:
 
 	/* Negative dentry, just create the file */
 	if (!path.dentry->d_inode) {
-		if (!IS_POSIXACL(dir->d_inode))
-			mode &= ~current->fs->umask;
-		error = vfs_create(dir->d_inode, path.dentry, mode, nd);
-		mutex_unlock(&dir->d_inode->i_mutex);
-		dput(nd->dentry);
-		nd->dentry = path.dentry;
+		error = open_namei_create(nd, &path, flag, mode);
 		if (error)
 			goto exit;
-		/* Don't check for write permission, don't truncate */
-		acc_mode = 0;
-		flag &= ~O_TRUNC;
-		goto ok;
+		return 0;
 	}
 
 	/*
@@ -1906,30 +1944,32 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
 {
 	int error = 0;
 	char * tmp;
+	struct dentry *dentry;
+	struct nameidata nd;
 
 	tmp = getname(pathname);
 	error = PTR_ERR(tmp);
-	if (!IS_ERR(tmp)) {
-		struct dentry *dentry;
-		struct nameidata nd;
+	if (IS_ERR(tmp))
+		goto out_err;
 
-		error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
-		if (error)
-			goto out;
-		dentry = lookup_create(&nd, 1);
-		error = PTR_ERR(dentry);
-		if (!IS_ERR(dentry)) {
-			if (!IS_POSIXACL(nd.dentry->d_inode))
-				mode &= ~current->fs->umask;
-			error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
-			dput(dentry);
-		}
-		mutex_unlock(&nd.dentry->d_inode->i_mutex);
-		path_release(&nd);
-out:
-		putname(tmp);
-	}
+	error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
+	if (error)
+		goto out;
+	dentry = lookup_create(&nd, 1);
+	error = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto out_unlock;
 
+	if (!IS_POSIXACL(nd.dentry->d_inode))
+		mode &= ~current->fs->umask;
+	error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
+	dput(dentry);
+out_unlock:
+	mutex_unlock(&nd.dentry->d_inode->i_mutex);
+	path_release(&nd);
+out:
+	putname(tmp);
+out_err:
 	return error;
 }
 
@@ -2028,10 +2068,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
 	mutex_lock_nested(&nd.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
 	dentry = lookup_hash(&nd);
 	error = PTR_ERR(dentry);
-	if (!IS_ERR(dentry)) {
-		error = vfs_rmdir(nd.dentry->d_inode, dentry);
-		dput(dentry);
-	}
+	if (IS_ERR(dentry))
+		goto exit2;
+	error = vfs_rmdir(nd.dentry->d_inode, dentry);
+	dput(dentry);
+exit2:
 	mutex_unlock(&nd.dentry->d_inode->i_mutex);
 exit1:
 	path_release(&nd);
@@ -2171,30 +2212,33 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
 	int error = 0;
 	char * from;
 	char * to;
+	struct dentry *dentry;
+	struct nameidata nd;
 
 	from = getname(oldname);
 	if(IS_ERR(from))
 		return PTR_ERR(from);
 	to = getname(newname);
 	error = PTR_ERR(to);
-	if (!IS_ERR(to)) {
-		struct dentry *dentry;
-		struct nameidata nd;
+	if (IS_ERR(to))
+		goto out_putname;
 
-		error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
-		if (error)
-			goto out;
-		dentry = lookup_create(&nd, 0);
-		error = PTR_ERR(dentry);
-		if (!IS_ERR(dentry)) {
-			error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
-			dput(dentry);
-		}
-		mutex_unlock(&nd.dentry->d_inode->i_mutex);
-		path_release(&nd);
+	error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
+	if (error)
+		goto out;
+	dentry = lookup_create(&nd, 0);
+	error = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+
+	error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
+	dput(dentry);
+out_unlock:
+	mutex_unlock(&nd.dentry->d_inode->i_mutex);
+	path_release(&nd);
 out:
-		putname(to);
-	}
+	putname(to);
+out_putname:
 	putname(from);
 	return error;
 }
@@ -2280,10 +2324,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 		goto out_release;
 	new_dentry = lookup_create(&nd, 0);
 	error = PTR_ERR(new_dentry);
-	if (!IS_ERR(new_dentry)) {
-		error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
-		dput(new_dentry);
-	}
+	if (IS_ERR(new_dentry))
+		goto out_unlock;
+	error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
+	dput(new_dentry);
+out_unlock:
 	mutex_unlock(&nd.dentry->d_inode->i_mutex);
 out_release:
 	path_release(&nd);
diff --git a/fs/namespace.c b/fs/namespace.c
index 36d180858136..55442a6cf221 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
+#include <linux/kernel.h>
 #include <linux/quotaops.h>
 #include <linux/acct.h>
 #include <linux/capability.h>
@@ -23,12 +24,11 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/mount.h>
+#include <linux/ramfs.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
 
-extern int __init init_rootfs(void);
-
 /* spinlock for vfsmount related operations, inplace of dcache_lock */
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 
@@ -133,7 +133,7 @@ struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 
 static inline int check_mnt(struct vfsmount *mnt)
 {
-	return mnt->mnt_namespace == current->namespace;
+	return mnt->mnt_namespace == current->nsproxy->namespace;
 }
 
 static void touch_namespace(struct namespace *ns)
@@ -830,7 +830,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	if (parent_nd) {
 		detach_mnt(source_mnt, parent_nd);
 		attach_mnt(source_mnt, nd);
-		touch_namespace(current->namespace);
+		touch_namespace(current->nsproxy->namespace);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
 		commit_tree(source_mnt);
@@ -1441,7 +1441,7 @@ dput_out:
  */
 struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 {
-	struct namespace *namespace = tsk->namespace;
+	struct namespace *namespace = tsk->nsproxy->namespace;
 	struct namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
 	struct vfsmount *p, *q;
@@ -1508,7 +1508,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 
 int copy_namespace(int flags, struct task_struct *tsk)
 {
-	struct namespace *namespace = tsk->namespace;
+	struct namespace *namespace = tsk->nsproxy->namespace;
 	struct namespace *new_ns;
 	int err = 0;
 
@@ -1531,7 +1531,7 @@ int copy_namespace(int flags, struct task_struct *tsk)
 		goto out;
 	}
 
-	tsk->namespace = new_ns;
+	tsk->nsproxy->namespace = new_ns;
 
 out:
 	put_namespace(namespace);
@@ -1754,7 +1754,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
 	detach_mnt(user_nd.mnt, &root_parent);
 	attach_mnt(user_nd.mnt, &old_nd);     /* mount old root on put_old */
 	attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */
-	touch_namespace(current->namespace);
+	touch_namespace(current->nsproxy->namespace);
 	spin_unlock(&vfsmount_lock);
 	chroot_fs_refs(&user_nd, &new_nd);
 	security_sb_post_pivotroot(&user_nd, &new_nd);
@@ -1780,7 +1780,6 @@ static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
 	struct namespace *namespace;
-	struct task_struct *g, *p;
 
 	mnt = do_kern_mount("rootfs", 0, "rootfs", NULL);
 	if (IS_ERR(mnt))
@@ -1796,13 +1795,8 @@ static void __init init_mount_tree(void)
 	namespace->root = mnt;
 	mnt->mnt_namespace = namespace;
 
-	init_task.namespace = namespace;
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		get_namespace(namespace);
-		p->namespace = namespace;
-	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
+	init_task.nsproxy->namespace = namespace;
+	get_namespace(namespace);
 
 	set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root);
 	set_fs_root(current->fs, namespace->root, namespace->root->mnt_root);
@@ -1813,6 +1807,7 @@ void __init mnt_init(unsigned long mempages)
 	struct list_head *d;
 	unsigned int nr_hash;
 	int i;
+	int err;
 
 	init_rwsem(&namespace_sem);
 
@@ -1853,8 +1848,14 @@ void __init mnt_init(unsigned long mempages)
 		d++;
 		i--;
 	} while (i);
-	sysfs_init();
-	subsystem_register(&fs_subsys);
+	err = sysfs_init();
+	if (err)
+		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
+			__FUNCTION__, err);
+	err = subsystem_register(&fs_subsys);
+	if (err)
+		printk(KERN_WARNING "%s: subsystem_register error: %d\n",
+			__FUNCTION__, err);
 	init_rootfs();
 	init_mount_tree();
 }
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b4ee89250e95..458b3b785194 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -53,6 +53,9 @@ const struct file_operations ncp_dir_operations =
 	.read		= generic_read_dir,
 	.readdir	= ncp_readdir,
 	.ioctl		= ncp_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ncp_compat_ioctl,
+#endif
 };
 
 struct inode_operations ncp_dir_inode_operations =
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index e6b7c67cf057..df37524b85db 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -289,6 +289,9 @@ const struct file_operations ncp_file_operations =
 	.read		= ncp_file_read,
 	.write		= ncp_file_write,
 	.ioctl		= ncp_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= ncp_compat_ioctl,
+#endif
 	.mmap		= ncp_mmap,
 	.release	= ncp_release,
 	.fsync		= ncp_fsync,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 1ddf77b0b825..42e3bef270c9 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -81,8 +81,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(ncp_inode_cachep))
-		printk(KERN_INFO "ncp_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(ncp_inode_cachep);
 }
 
 static int ncp_remount(struct super_block *sb, int *flags, char* data)
@@ -224,7 +223,6 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 	inode->i_nlink = 1;
 	inode->i_uid = server->m.uid;
 	inode->i_gid = server->m.gid;
-	inode->i_blksize = NCP_BLOCK_SIZE;
 
 	ncp_update_dates(inode, &nwinfo->i);
 	ncp_update_inode(inode, nwinfo);
@@ -411,11 +409,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
 #endif
 	struct ncp_entry_info finfo;
 
-	server = kmalloc(sizeof(struct ncp_server), GFP_KERNEL);
+	server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
 	if (!server)
 		return -ENOMEM;
 	sb->s_fs_info = server;
-	memset(server, 0, sizeof(struct ncp_server));
 
 	error = -EFAULT;
 	if (raw_data == NULL)
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 42039fe0653c..589d1eac55c1 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -7,19 +7,21 @@
  *
  */
 
-
-#include <asm/uaccess.h>
 #include <linux/capability.h>
+#include <linux/compat.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/ioctl.h>
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/highuid.h>
+#include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 
 #include <linux/ncp_fs.h>
 
+#include <asm/uaccess.h>
+
 #include "ncplib_kernel.h"
 
 /* maximum limit for ncp_objectname_ioctl */
@@ -89,6 +91,82 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct file *file,
 	return 0;
 }
 
+#ifdef CONFIG_COMPAT
+struct compat_ncp_objectname_ioctl
+{
+	s32		auth_type;
+	u32		object_name_len;
+	compat_caddr_t	object_name;	/* an userspace data, in most cases user name */
+};
+
+struct compat_ncp_fs_info_v2 {
+	s32 version;
+	u32 mounted_uid;
+	u32 connection;
+	u32 buffer_size;
+
+	u32 volume_number;
+	u32 directory_id;
+
+	u32 dummy1;
+	u32 dummy2;
+	u32 dummy3;
+};
+
+struct compat_ncp_ioctl_request {
+	u32 function;
+	u32 size;
+	compat_caddr_t data;
+};
+
+struct compat_ncp_privatedata_ioctl
+{
+	u32		len;
+	compat_caddr_t	data;		/* ~1000 for NDS */
+};
+
+#define NCP_IOC_GET_FS_INFO_V2_32	_IOWR('n', 4, struct compat_ncp_fs_info_v2)
+#define NCP_IOC_NCPREQUEST_32		_IOR('n', 1, struct compat_ncp_ioctl_request)
+#define NCP_IOC_GETOBJECTNAME_32	_IOWR('n', 9, struct compat_ncp_objectname_ioctl)
+#define NCP_IOC_SETOBJECTNAME_32	_IOR('n', 9, struct compat_ncp_objectname_ioctl)
+#define NCP_IOC_GETPRIVATEDATA_32	_IOWR('n', 10, struct compat_ncp_privatedata_ioctl)
+#define NCP_IOC_SETPRIVATEDATA_32	_IOR('n', 10, struct compat_ncp_privatedata_ioctl)
+
+static int
+ncp_get_compat_fs_info_v2(struct ncp_server * server, struct file *file,
+		   struct compat_ncp_fs_info_v2 __user * arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct compat_ncp_fs_info_v2 info2;
+
+	if ((file_permission(file, MAY_WRITE) != 0)
+	    && (current->uid != server->m.mounted_uid)) {
+		return -EACCES;
+	}
+	if (copy_from_user(&info2, arg, sizeof(info2)))
+		return -EFAULT;
+
+	if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
+		DPRINTK("info.version invalid: %d\n", info2.version);
+		return -EINVAL;
+	}
+	info2.mounted_uid   = server->m.mounted_uid;
+	info2.connection    = server->connection;
+	info2.buffer_size   = server->buffer_size;
+	info2.volume_number = NCP_FINFO(inode)->volNumber;
+	info2.directory_id  = NCP_FINFO(inode)->DosDirNum;
+	info2.dummy1 = info2.dummy2 = info2.dummy3 = 0;
+
+	if (copy_to_user(arg, &info2, sizeof(info2)))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+#define NCP_IOC_GETMOUNTUID16		_IOW('n', 2, u16)
+#define NCP_IOC_GETMOUNTUID32		_IOW('n', 2, u32)
+#define NCP_IOC_GETMOUNTUID64		_IOW('n', 2, u64)
+
 #ifdef CONFIG_NCPFS_NLS
 /* Here we are select the iocharset and the codepage for NLS.
  * Thanks Petr Vandrovec for idea and many hints.
@@ -192,12 +270,24 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
 	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_NCPREQUEST_32:
+#endif
 	case NCP_IOC_NCPREQUEST:
-
 		if ((file_permission(filp, MAY_WRITE) != 0)
 		    && (current->uid != server->m.mounted_uid)) {
 			return -EACCES;
 		}
+#ifdef CONFIG_COMPAT
+		if (cmd == NCP_IOC_NCPREQUEST_32) {
+			struct compat_ncp_ioctl_request request32;
+			if (copy_from_user(&request32, argp, sizeof(request32)))
+				return -EFAULT;
+			request.function = request32.function;
+			request.size = request32.size;
+			request.data = compat_ptr(request32.data);
+		} else
+#endif
 		if (copy_from_user(&request, argp, sizeof(request)))
 			return -EFAULT;
 
@@ -254,19 +344,35 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
 	case NCP_IOC_GET_FS_INFO_V2:
 		return ncp_get_fs_info_v2(server, filp, argp);
 
-	case NCP_IOC_GETMOUNTUID2:
-		{
-			unsigned long tmp = server->m.mounted_uid;
-
-			if ((file_permission(filp, MAY_READ) != 0)
-			    && (current->uid != server->m.mounted_uid))
-			{
-				return -EACCES;
-			}
-			if (put_user(tmp, (unsigned long __user *)argp)) 
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_GET_FS_INFO_V2_32:
+		return ncp_get_compat_fs_info_v2(server, filp, argp);
+#endif
+	/* we have too many combinations of CONFIG_COMPAT,
+	 * CONFIG_64BIT and CONFIG_UID16, so just handle
+	 * any of the possible ioctls */
+	case NCP_IOC_GETMOUNTUID16:
+	case NCP_IOC_GETMOUNTUID32:
+	case NCP_IOC_GETMOUNTUID64:
+		if ((file_permission(filp, MAY_READ) != 0)
+			&& (current->uid != server->m.mounted_uid)) {
+			return -EACCES;
+		}
+		if (cmd == NCP_IOC_GETMOUNTUID16) {
+			u16 uid;
+			SET_UID(uid, server->m.mounted_uid);
+			if (put_user(uid, (u16 __user *)argp))
+				return -EFAULT;
+		} else if (cmd == NCP_IOC_GETMOUNTUID32) {
+			if (put_user(server->m.mounted_uid,
+						(u32 __user *)argp))
+				return -EFAULT;
+		} else {
+			if (put_user(server->m.mounted_uid,
+						(u64 __user *)argp))
 				return -EFAULT;
-			return 0;
 		}
+		return 0;
 
 	case NCP_IOC_GETROOT:
 		{
@@ -476,6 +582,32 @@ outrel:
 		}
 #endif	/* CONFIG_NCPFS_IOCTL_LOCKING */
 
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_GETOBJECTNAME_32:
+		if (current->uid != server->m.mounted_uid) {
+			return -EACCES;
+		}
+		{
+			struct compat_ncp_objectname_ioctl user;
+			size_t outl;
+
+			if (copy_from_user(&user, argp, sizeof(user)))
+				return -EFAULT;
+			user.auth_type = server->auth.auth_type;
+			outl = user.object_name_len;
+			user.object_name_len = server->auth.object_name_len;
+			if (outl > user.object_name_len)
+				outl = user.object_name_len;
+			if (outl) {
+				if (copy_to_user(compat_ptr(user.object_name),
+						 server->auth.object_name,
+						 outl)) return -EFAULT;
+			}
+			if (copy_to_user(argp, &user, sizeof(user)))
+				return -EFAULT;
+			return 0;
+		}
+#endif
 	case NCP_IOC_GETOBJECTNAME:
 		if (current->uid != server->m.mounted_uid) {
 			return -EACCES;
@@ -500,6 +632,9 @@ outrel:
 				return -EFAULT;
 			return 0;
 		}
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_SETOBJECTNAME_32:
+#endif
 	case NCP_IOC_SETOBJECTNAME:
 		if (current->uid != server->m.mounted_uid) {
 			return -EACCES;
@@ -512,8 +647,19 @@ outrel:
 			void* oldprivate;
 			size_t oldprivatelen;
 
+#ifdef CONFIG_COMPAT
+			if (cmd == NCP_IOC_SETOBJECTNAME_32) {
+				struct compat_ncp_objectname_ioctl user32;
+				if (copy_from_user(&user32, argp, sizeof(user32)))
+					return -EFAULT;
+				user.auth_type = user32.auth_type;
+				user.object_name_len = user32.object_name_len;
+				user.object_name = compat_ptr(user32.object_name);
+			} else
+#endif
 			if (copy_from_user(&user, argp, sizeof(user)))
 				return -EFAULT;
+
 			if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN)
 				return -ENOMEM;
 			if (user.object_name_len) {
@@ -544,6 +690,9 @@ outrel:
 			kfree(oldname);
 			return 0;
 		}
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_GETPRIVATEDATA_32:
+#endif
 	case NCP_IOC_GETPRIVATEDATA:
 		if (current->uid != server->m.mounted_uid) {
 			return -EACCES;
@@ -552,8 +701,18 @@ outrel:
 			struct ncp_privatedata_ioctl user;
 			size_t outl;
 
+#ifdef CONFIG_COMPAT
+			if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
+				struct compat_ncp_privatedata_ioctl user32;
+				if (copy_from_user(&user32, argp, sizeof(user32)))
+					return -EFAULT;
+				user.len = user32.len;
+				user.data = compat_ptr(user32.data);
+			} else
+#endif
 			if (copy_from_user(&user, argp, sizeof(user)))
 				return -EFAULT;
+
 			outl = user.len;
 			user.len = server->priv.len;
 			if (outl > user.len) outl = user.len;
@@ -562,10 +721,23 @@ outrel:
 						 server->priv.data,
 						 outl)) return -EFAULT;
 			}
+#ifdef CONFIG_COMPAT
+			if (cmd == NCP_IOC_GETPRIVATEDATA_32) {
+				struct compat_ncp_privatedata_ioctl user32;
+				user32.len = user.len;
+				user32.data = (unsigned long) user.data;
+				if (copy_to_user(argp, &user32, sizeof(user32)))
+					return -EFAULT;
+			} else
+#endif
 			if (copy_to_user(argp, &user, sizeof(user)))
 				return -EFAULT;
+
 			return 0;
 		}
+#ifdef CONFIG_COMPAT
+	case NCP_IOC_SETPRIVATEDATA_32:
+#endif
 	case NCP_IOC_SETPRIVATEDATA:
 		if (current->uid != server->m.mounted_uid) {
 			return -EACCES;
@@ -576,8 +748,18 @@ outrel:
 			void* old;
 			size_t oldlen;
 
+#ifdef CONFIG_COMPAT
+			if (cmd == NCP_IOC_SETPRIVATEDATA_32) {
+				struct compat_ncp_privatedata_ioctl user32;
+				if (copy_from_user(&user32, argp, sizeof(user32)))
+					return -EFAULT;
+				user.len = user32.len;
+				user.data = compat_ptr(user32.data);
+			} else
+#endif
 			if (copy_from_user(&user, argp, sizeof(user)))
 				return -EFAULT;
+
 			if (user.len > NCP_PRIVATE_DATA_MAX_LEN)
 				return -ENOMEM;
 			if (user.len) {
@@ -636,20 +818,19 @@ outrel:
 		}
 
 	}
-/* #ifdef CONFIG_UID16 */
-	/* NCP_IOC_GETMOUNTUID may be same as NCP_IOC_GETMOUNTUID2,
-           so we have this out of switch */
-	if (cmd == NCP_IOC_GETMOUNTUID) {
-		__kernel_uid_t uid = 0;
-		if ((file_permission(filp, MAY_READ) != 0)
-		    && (current->uid != server->m.mounted_uid)) {
-			return -EACCES;
-		}
-		SET_UID(uid, server->m.mounted_uid);
-		if (put_user(uid, (__kernel_uid_t __user *)argp))
-			return -EFAULT;
-		return 0;
-	}
-/* #endif */
 	return -EINVAL;
 }
+
+#ifdef CONFIG_COMPAT
+long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int ret;
+
+	lock_kernel();
+	arg = (unsigned long) compat_ptr(arg);
+	ret = ncp_ioctl(inode, file, cmd, arg);
+	unlock_kernel();
+	return ret;
+}
+#endif
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index ca92c2406635..e3d26c1bd105 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -48,7 +48,7 @@ static int ncp_symlink_readpage(struct file *file, struct page *page)
 	char *buf = kmap(page);
 
 	error = -ENOMEM;
-	rawlink=(char *)kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL);
+	rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL);
 	if (!rawlink)
 		goto fail;
 
@@ -126,7 +126,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) {
 	/* EPERM is returned by VFS if symlink procedure does not exist */
 		return -EPERM;
   
-	rawlink=(char *)kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL);
+	rawlink = kmalloc(NCP_MAX_SYMLINK_SIZE, GFP_KERNEL);
 	if (!rawlink)
 		return -ENOMEM;
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a3ee11364db0..7933e2e99dbc 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -58,7 +58,6 @@ module_param_call(callback_tcpport, param_set_port, param_get_int,
  */
 static void nfs_callback_svc(struct svc_rqst *rqstp)
 {
-	struct svc_serv *serv = rqstp->rq_server;
 	int err;
 
 	__module_get(THIS_MODULE);
@@ -80,7 +79,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
 		/*
 		 * Listen for a request on the socket
 		 */
-		err = svc_recv(serv, rqstp, MAX_SCHEDULE_TIMEOUT);
+		err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT);
 		if (err == -EAGAIN || err == -EINTR)
 			continue;
 		if (err < 0) {
@@ -91,7 +90,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp)
 		}
 		dprintk("%s: request from %u.%u.%u.%u\n", __FUNCTION__,
 				NIPQUAD(rqstp->rq_addr.sin_addr.s_addr));
-		svc_process(serv, rqstp);
+		svc_process(rqstp);
 	}
 
 	svc_exit_thread(rqstp);
@@ -116,7 +115,7 @@ int nfs_callback_up(void)
 		goto out;
 	init_completion(&nfs_callback_info.started);
 	init_completion(&nfs_callback_info.stopped);
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE);
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
 	ret = -ENOMEM;
 	if (!serv)
 		goto out_err;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 5676163d26e8..db3d7919c601 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -31,10 +31,10 @@ struct cb_compound_hdr_arg {
 };
 
 struct cb_compound_hdr_res {
-	uint32_t *status;
+	__be32 *status;
 	int taglen;
 	const char *tag;
-	uint32_t *nops;
+	__be32 *nops;
 };
 
 struct cb_getattrargs {
@@ -44,7 +44,7 @@ struct cb_getattrargs {
 };
 
 struct cb_getattrres {
-	uint32_t status;
+	__be32 status;
 	uint32_t bitmap[2];
 	uint64_t size;
 	uint64_t change_attr;
@@ -59,8 +59,8 @@ struct cb_recallargs {
 	uint32_t truncate;
 };
 
-extern unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
-extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
+extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res);
+extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy);
 
 #ifdef CONFIG_NFS_V4
 extern int nfs_callback_up(void);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 97cf8f71451f..72e55d83756d 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -14,7 +14,7 @@
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
  
-unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
+__be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res)
 {
 	struct nfs_client *clp;
 	struct nfs_delegation *delegation;
@@ -55,11 +55,11 @@ out:
 	return res->status;
 }
 
-unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
+__be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 {
 	struct nfs_client *clp;
 	struct inode *inode;
-	unsigned res;
+	__be32 res;
 	
 	res = htonl(NFS4ERR_BADHANDLE);
 	clp = nfs_find_client(args->addr, 4);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 29f932192054..f8ea1f51f590 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -22,9 +22,9 @@
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
 
-typedef unsigned (*callback_process_op_t)(void *, void *);
-typedef unsigned (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
-typedef unsigned (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
+typedef __be32 (*callback_process_op_t)(void *, void *);
+typedef __be32 (*callback_decode_arg_t)(struct svc_rqst *, struct xdr_stream *, void *);
+typedef __be32 (*callback_encode_res_t)(struct svc_rqst *, struct xdr_stream *, void *);
 
 
 struct callback_op {
@@ -36,24 +36,24 @@ struct callback_op {
 
 static struct callback_op callback_ops[];
 
-static int nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
+static __be32 nfs4_callback_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return htonl(NFS4_OK);
 }
 
-static int nfs4_decode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy)
+static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_argsize_check(rqstp, p);
 }
 
-static int nfs4_encode_void(struct svc_rqst *rqstp, uint32_t *p, void *dummy)
+static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
 
-static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes)
+static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	p = xdr_inline_decode(xdr, nbytes);
 	if (unlikely(p == NULL))
@@ -61,9 +61,9 @@ static uint32_t *read_buf(struct xdr_stream *xdr, int nbytes)
 	return p;
 }
 
-static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
+static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	p = read_buf(xdr, 4);
 	if (unlikely(p == NULL))
@@ -81,9 +81,9 @@ static unsigned decode_string(struct xdr_stream *xdr, unsigned int *len, const c
 	return 0;
 }
 
-static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	p = read_buf(xdr, 4);
 	if (unlikely(p == NULL))
@@ -99,9 +99,9 @@ static unsigned decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
 	return 0;
 }
 
-static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
+static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 {
-	uint32_t *p;
+	__be32 *p;
 	unsigned int attrlen;
 
 	p = read_buf(xdr, 4);
@@ -118,9 +118,9 @@ static unsigned decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	return 0;
 }
 
-static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	p = read_buf(xdr, 16);
 	if (unlikely(p == NULL))
@@ -129,11 +129,11 @@ static unsigned decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
 	return 0;
 }
 
-static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
+static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr)
 {
-	uint32_t *p;
+	__be32 *p;
 	unsigned int minor_version;
-	unsigned status;
+	__be32 status;
 
 	status = decode_string(xdr, &hdr->taglen, &hdr->tag);
 	if (unlikely(status != 0))
@@ -159,9 +159,9 @@ static unsigned decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compou
 	return 0;
 }
 
-static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
+static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
 {
-	uint32_t *p;
+	__be32 *p;
 	p = read_buf(xdr, 4);
 	if (unlikely(p == NULL))
 		return htonl(NFS4ERR_RESOURCE);
@@ -169,9 +169,9 @@ static unsigned decode_op_hdr(struct xdr_stream *xdr, unsigned int *op)
 	return 0;
 }
 
-static unsigned decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
+static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_getattrargs *args)
 {
-	unsigned status;
+	__be32 status;
 
 	status = decode_fh(xdr, &args->fh);
 	if (unlikely(status != 0))
@@ -183,10 +183,10 @@ out:
 	return status;
 }
 
-static unsigned decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
+static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args)
 {
-	uint32_t *p;
-	unsigned status;
+	__be32 *p;
+	__be32 status;
 
 	args->addr = &rqstp->rq_addr;
 	status = decode_stateid(xdr, &args->stateid);
@@ -204,9 +204,9 @@ out:
 	return status;
 }
 
-static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
+static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 4 + len);
 	if (unlikely(p == NULL))
@@ -217,10 +217,10 @@ static unsigned encode_string(struct xdr_stream *xdr, unsigned int len, const ch
 
 #define CB_SUPPORTED_ATTR0 (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE)
 #define CB_SUPPORTED_ATTR1 (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY)
-static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, uint32_t **savep)
+static __be32 encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitmap, __be32 **savep)
 {
-	uint32_t bm[2];
-	uint32_t *p;
+	__be32 bm[2];
+	__be32 *p;
 
 	bm[0] = htonl(bitmap[0] & CB_SUPPORTED_ATTR0);
 	bm[1] = htonl(bitmap[1] & CB_SUPPORTED_ATTR1);
@@ -247,9 +247,9 @@ static unsigned encode_attr_bitmap(struct xdr_stream *xdr, const uint32_t *bitma
 	return 0;
 }
 
-static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
+static __be32 encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t change)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	if (!(bitmap[0] & FATTR4_WORD0_CHANGE))
 		return 0;
@@ -260,9 +260,9 @@ static unsigned encode_attr_change(struct xdr_stream *xdr, const uint32_t *bitma
 	return 0;
 }
 
-static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
+static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, uint64_t size)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	if (!(bitmap[0] & FATTR4_WORD0_SIZE))
 		return 0;
@@ -273,9 +273,9 @@ static unsigned encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap,
 	return 0;
 }
 
-static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
+static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 12);
 	if (unlikely(p == 0))
@@ -285,23 +285,23 @@ static unsigned encode_attr_time(struct xdr_stream *xdr, const struct timespec *
 	return 0;
 }
 
-static unsigned encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
+static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
 {
 	if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA))
 		return 0;
 	return encode_attr_time(xdr,time);
 }
 
-static unsigned encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
+static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time)
 {
 	if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY))
 		return 0;
 	return encode_attr_time(xdr,time);
 }
 
-static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
+static __be32 encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compound_hdr_res *hdr)
 {
-	unsigned status;
+	__be32 status;
 
 	hdr->status = xdr_reserve_space(xdr, 4);
 	if (unlikely(hdr->status == NULL))
@@ -315,9 +315,9 @@ static unsigned encode_compound_hdr_res(struct xdr_stream *xdr, struct cb_compou
 	return 0;
 }
 
-static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res)
+static __be32 encode_op_hdr(struct xdr_stream *xdr, uint32_t op, __be32 res)
 {
-	uint32_t *p;
+	__be32 *p;
 	
 	p = xdr_reserve_space(xdr, 8);
 	if (unlikely(p == NULL))
@@ -327,10 +327,10 @@ static unsigned encode_op_hdr(struct xdr_stream *xdr, uint32_t op, uint32_t res)
 	return 0;
 }
 
-static unsigned encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
+static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, const struct cb_getattrres *res)
 {
-	uint32_t *savep = NULL;
-	unsigned status = res->status;
+	__be32 *savep = NULL;
+	__be32 status = res->status;
 	
 	if (unlikely(status != 0))
 		goto out;
@@ -353,15 +353,15 @@ out:
 	return status;
 }
 
-static unsigned process_op(struct svc_rqst *rqstp,
+static __be32 process_op(struct svc_rqst *rqstp,
 		struct xdr_stream *xdr_in, void *argp,
 		struct xdr_stream *xdr_out, void *resp)
 {
 	struct callback_op *op = &callback_ops[0];
 	unsigned int op_nr = OP_CB_ILLEGAL;
-	unsigned int status = 0;
+	__be32 status = 0;
 	long maxlen;
-	unsigned res;
+	__be32 res;
 
 	dprintk("%s: start\n", __FUNCTION__);
 	status = decode_op_hdr(xdr_in, &op_nr);
@@ -399,20 +399,20 @@ static unsigned process_op(struct svc_rqst *rqstp,
 /*
  * Decode, process and encode a COMPOUND
  */
-static int nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
+static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	struct cb_compound_hdr_arg hdr_arg;
 	struct cb_compound_hdr_res hdr_res;
 	struct xdr_stream xdr_in, xdr_out;
-	uint32_t *p;
-	unsigned int status;
+	__be32 *p;
+	__be32 status;
 	unsigned int nops = 1;
 
 	dprintk("%s: start\n", __FUNCTION__);
 
 	xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
 
-	p = (uint32_t*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
+	p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
 	xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
 
 	decode_compound_hdr_arg(&xdr_in, &hdr_arg);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index ec1938d4b814..5fea638743e4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -10,7 +10,6 @@
  */
 
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
@@ -233,11 +232,15 @@ void nfs_put_client(struct nfs_client *clp)
  * Find a client by address
  * - caller must hold nfs_client_lock
  */
-static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion)
+static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int nfsversion, int match_port)
 {
 	struct nfs_client *clp;
 
 	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+		/* Don't match clients that failed to initialise properly */
+		if (clp->cl_cons_state < 0)
+			continue;
+
 		/* Different NFS versions cannot share the same nfs_client */
 		if (clp->cl_nfsversion != nfsversion)
 			continue;
@@ -246,7 +249,7 @@ static struct nfs_client *__nfs_find_client(const struct sockaddr_in *addr, int
 			   sizeof(clp->cl_addr.sin_addr)) != 0)
 			continue;
 
-		if (clp->cl_addr.sin_port == addr->sin_port)
+		if (!match_port || clp->cl_addr.sin_port == addr->sin_port)
 			goto found;
 	}
 
@@ -266,11 +269,12 @@ struct nfs_client *nfs_find_client(const struct sockaddr_in *addr, int nfsversio
 	struct nfs_client *clp;
 
 	spin_lock(&nfs_client_lock);
-	clp = __nfs_find_client(addr, nfsversion);
+	clp = __nfs_find_client(addr, nfsversion, 0);
 	spin_unlock(&nfs_client_lock);
-
-	BUG_ON(clp && clp->cl_cons_state == 0);
-
+	if (clp != NULL && clp->cl_cons_state != NFS_CS_READY) {
+		nfs_put_client(clp);
+		clp = NULL;
+	}
 	return clp;
 }
 
@@ -293,7 +297,7 @@ static struct nfs_client *nfs_get_client(const char *hostname,
 	do {
 		spin_lock(&nfs_client_lock);
 
-		clp = __nfs_find_client(addr, nfsversion);
+		clp = __nfs_find_client(addr, nfsversion, 1);
 		if (clp)
 			goto found_client;
 		if (new)
@@ -323,25 +327,11 @@ found_client:
 	if (new)
 		nfs_free_client(new);
 
-	if (clp->cl_cons_state == NFS_CS_INITING) {
-		DECLARE_WAITQUEUE(myself, current);
-
-		add_wait_queue(&nfs_client_active_wq, &myself);
-
-		for (;;) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			if (signal_pending(current) ||
-			    clp->cl_cons_state > NFS_CS_READY)
-				break;
-			schedule();
-		}
-
-		remove_wait_queue(&nfs_client_active_wq, &myself);
-
-		if (signal_pending(current)) {
-			nfs_put_client(clp);
-			return ERR_PTR(-ERESTARTSYS);
-		}
+	error = wait_event_interruptible(nfs_client_active_wq,
+				clp->cl_cons_state != NFS_CS_INITING);
+	if (error < 0) {
+		nfs_put_client(clp);
+		return ERR_PTR(-ERESTARTSYS);
 	}
 
 	if (clp->cl_cons_state < NFS_CS_READY) {
@@ -460,7 +450,8 @@ static int nfs_start_lockd(struct nfs_server *server)
 		goto out;
 	if (server->flags & NFS_MOUNT_NONLM)
 		goto out;
-	error = lockd_up();
+	error = lockd_up((server->flags & NFS_MOUNT_TCP) ?
+			IPPROTO_TCP : IPPROTO_UDP);
 	if (error < 0)
 		server->flags |= NFS_MOUNT_NONLM;
 	else
@@ -863,6 +854,7 @@ error:
  */
 static int nfs4_init_client(struct nfs_client *clp,
 		int proto, int timeo, int retrans,
+		const char *ip_addr,
 		rpc_authflavor_t authflavour)
 {
 	int error;
@@ -879,6 +871,7 @@ static int nfs4_init_client(struct nfs_client *clp,
 	error = nfs_create_rpc_client(clp, proto, timeo, retrans, authflavour);
 	if (error < 0)
 		goto error;
+	memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
 
 	error = nfs_idmap_new(clp);
 	if (error < 0) {
@@ -902,6 +895,7 @@ error:
  */
 static int nfs4_set_client(struct nfs_server *server,
 		const char *hostname, const struct sockaddr_in *addr,
+		const char *ip_addr,
 		rpc_authflavor_t authflavour,
 		int proto, int timeo, int retrans)
 {
@@ -916,7 +910,7 @@ static int nfs4_set_client(struct nfs_server *server,
 		error = PTR_ERR(clp);
 		goto error;
 	}
-	error = nfs4_init_client(clp, proto, timeo, retrans, authflavour);
+	error = nfs4_init_client(clp, proto, timeo, retrans, ip_addr, authflavour);
 	if (error < 0)
 		goto error_put;
 
@@ -985,7 +979,7 @@ struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data,
 		return ERR_PTR(-ENOMEM);
 
 	/* Get a client record */
-	error = nfs4_set_client(server, hostname, addr, authflavour,
+	error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour,
 			data->proto, data->timeo, data->retrans);
 	if (error < 0)
 		goto error;
@@ -1055,6 +1049,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 	/* Get a client representation.
 	 * Note: NFSv4 always uses TCP, */
 	error = nfs4_set_client(server, data->hostname, data->addr,
+			parent_client->cl_ipaddr,
 			data->authflavor,
 			parent_server->client->cl_xprt->prot,
 			parent_client->retrans_timeo,
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 57133678db16..841c99a9b11c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -20,11 +20,6 @@
 #include "delegation.h"
 #include "internal.h"
 
-static struct nfs_delegation *nfs_alloc_delegation(void)
-{
-	return (struct nfs_delegation *)kmalloc(sizeof(struct nfs_delegation), GFP_KERNEL);
-}
-
 static void nfs_free_delegation(struct nfs_delegation *delegation)
 {
 	if (delegation->cred)
@@ -124,7 +119,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
 	if ((nfsi->cache_validity & (NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATTR)))
 		__nfs_revalidate_inode(NFS_SERVER(inode), inode);
 
-	delegation = nfs_alloc_delegation();
+	delegation = kmalloc(sizeof(*delegation), GFP_KERNEL);
 	if (delegation == NULL)
 		return -ENOMEM;
 	memcpy(delegation->stateid.data, res->delegation.data,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7432f1a43f3d..b34cd16f472f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -142,12 +142,12 @@ nfs_opendir(struct inode *inode, struct file *filp)
 	return res;
 }
 
-typedef u32 * (*decode_dirent_t)(u32 *, struct nfs_entry *, int);
+typedef __be32 * (*decode_dirent_t)(__be32 *, struct nfs_entry *, int);
 typedef struct {
 	struct file	*file;
 	struct page	*page;
 	unsigned long	page_index;
-	u32		*ptr;
+	__be32		*ptr;
 	u64		*dir_cookie;
 	loff_t		current_index;
 	struct nfs_entry *entry;
@@ -203,8 +203,10 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	 * Note: assumes we have exclusive access to this mapping either
 	 *	 through inode->i_mutex or some other mechanism.
 	 */
-	if (page->index == 0)
-		invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1);
+	if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) {
+		/* Should never happen */
+		nfs_zap_mapping(inode, inode->i_mapping);
+	}
 	unlock_page(page);
 	return 0;
  error:
@@ -218,7 +220,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 static inline
 int dir_decode(nfs_readdir_descriptor_t *desc)
 {
-	u32	*p = desc->ptr;
+	__be32	*p = desc->ptr;
 	p = desc->decode(p, desc->entry, desc->plus);
 	if (IS_ERR(p))
 		return PTR_ERR(p);
@@ -843,7 +845,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 	nfs_inode_return_delegation(inode);
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
 		lock_kernel();
-		inode->i_nlink--;
+		drop_nlink(inode);
 		nfs_complete_unlink(dentry);
 		unlock_kernel();
 	}
@@ -933,8 +935,17 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
 
 no_entry:
 	res = d_materialise_unique(dentry, inode);
-	if (res != NULL)
+	if (res != NULL) {
+		struct dentry *parent;
+		if (IS_ERR(res))
+			goto out_unlock;
+		/* Was a directory renamed! */
+		parent = dget_parent(res);
+		if (!IS_ROOT(parent))
+			nfs_mark_for_revalidate(parent->d_inode);
+		dput(parent);
 		dentry = res;
+	}
 	nfs_renew_times(dentry);
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 out_unlock:
@@ -1130,6 +1141,8 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc)
 	alias = d_materialise_unique(dentry, inode);
 	if (alias != NULL) {
 		dput(dentry);
+		if (IS_ERR(alias))
+			return NULL;
 		dentry = alias;
 	}
 
@@ -1286,7 +1299,7 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
 	error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
 	/* Ensure the VFS deletes this inode */
 	if (error == 0 && dentry->d_inode != NULL)
-		dentry->d_inode->i_nlink = 0;
+		clear_nlink(dentry->d_inode);
 	nfs_end_data_update(dir);
 	unlock_kernel();
 
@@ -1401,7 +1414,7 @@ static int nfs_safe_remove(struct dentry *dentry)
 		error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
 		/* The VFS may want to delete this inode */
 		if (error == 0)
-			inode->i_nlink--;
+			drop_nlink(inode);
 		nfs_mark_for_revalidate(inode);
 		nfs_end_data_update(inode);
 	} else
@@ -1517,8 +1530,8 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
 	pagevec_init(&lru_pvec, 0);
 	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
 							GFP_KERNEL)) {
-		if (!pagevec_add(&lru_pvec, page))
-			__pagevec_lru_add(&lru_pvec);
+		pagevec_add(&lru_pvec, page);
+		pagevec_lru_add(&lru_pvec);
 		SetPageUptodate(page);
 		unlock_page(page);
 	} else
@@ -1639,7 +1652,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out;
 		}
 	} else
-		new_inode->i_nlink--;
+		drop_nlink(new_inode);
 
 go_ahead:
 	/*
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 76ca1cbc38f9..bdfabf854a51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -497,6 +497,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 			if (dreq->commit_data != NULL)
 				nfs_commit_free(dreq->commit_data);
 			nfs_direct_free_writedata(dreq);
+			nfs_zap_mapping(inode, inode->i_mapping);
 			nfs_direct_complete(dreq);
 	}
 }
@@ -517,6 +518,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 {
 	nfs_end_data_update(inode);
 	nfs_direct_free_writedata(dreq);
+	nfs_zap_mapping(inode, inode->i_mapping);
 	nfs_direct_complete(dreq);
 }
 #endif
@@ -532,10 +534,12 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 
 	spin_lock(&dreq->lock);
 
-	if (likely(status >= 0))
-		dreq->count += data->res.count;
-	else
-		dreq->error = task->tk_status;
+	if (unlikely(status < 0)) {
+		dreq->error = status;
+		goto out_unlock;
+	}
+
+	dreq->count += data->res.count;
 
 	if (data->res.verf->committed != NFS_FILE_SYNC) {
 		switch (dreq->flags) {
@@ -550,7 +554,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
 				}
 		}
 	}
-
+out_unlock:
 	spin_unlock(&dreq->lock);
 }
 
@@ -707,8 +711,8 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
 /**
  * nfs_file_direct_read - file direct read operation for NFS files
  * @iocb: target I/O control block
- * @buf: user's buffer into which to read data
- * @count: number of bytes to read
+ * @iov: vector of user buffers into which to read data
+ * @nr_segs: size of iov vector
  * @pos: byte offset in file where reading starts
  *
  * We use this function for direct reads instead of calling
@@ -725,17 +729,24 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, siz
  * client must read the updated atime from the server back into its
  * cache.
  */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
 {
 	ssize_t retval = -EINVAL;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
+	/* XXX: temporary */
+	const char __user *buf = iov[0].iov_base;
+	size_t count = iov[0].iov_len;
 
 	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
 		file->f_dentry->d_parent->d_name.name,
 		file->f_dentry->d_name.name,
 		(unsigned long) count, (long long) pos);
 
+	if (nr_segs != 1)
+		return -EINVAL;
+
 	if (count < 0)
 		goto out;
 	retval = -EFAULT;
@@ -760,8 +771,8 @@ out:
 /**
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
- * @buf: user's buffer from which to write data
- * @count: number of bytes to write
+ * @iov: vector of user buffers from which to write data
+ * @nr_segs: size of iov vector
  * @pos: byte offset in file where writing starts
  *
  * We use this function for direct writes instead of calling
@@ -782,17 +793,24 @@ out:
  * Note that O_APPEND is not supported for NFS direct writes, as there
  * is no atomic O_APPEND write facility in the NFS protocol.
  */
-ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
 {
 	ssize_t retval;
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
+	/* XXX: temporary */
+	const char __user *buf = iov[0].iov_base;
+	size_t count = iov[0].iov_len;
 
 	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
 		file->f_dentry->d_parent->d_name.name,
 		file->f_dentry->d_name.name,
 		(unsigned long) count, (long long) pos);
 
+	if (nr_segs != 1)
+		return -EINVAL;
+
 	retval = generic_write_checks(file, &pos, &count, 0);
 	if (retval)
 		goto out;
@@ -814,17 +832,6 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t
 
 	retval = nfs_direct_write(iocb, (unsigned long) buf, count, pos);
 
-	/*
-	 * XXX: nfs_end_data_update() already ensures this file's
-	 *      cached data is subsequently invalidated.  Do we really
-	 *      need to call invalidate_inode_pages2() again here?
-	 *
-	 *      For aio writes, this invalidation will almost certainly
-	 *      occur before the writes complete.  Kind of racey.
-	 */
-	if (mapping->nrpages)
-		invalidate_inode_pages2(mapping);
-
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -855,6 +862,5 @@ int __init nfs_init_directcache(void)
  */
 void nfs_destroy_directcache(void)
 {
-	if (kmem_cache_destroy(nfs_direct_cachep))
-		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
+	kmem_cache_destroy(nfs_direct_cachep);
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index be997d649127..cc93865cea93 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -41,8 +41,10 @@ static int nfs_file_release(struct inode *, struct file *);
 static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin);
 static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
 static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *);
-static ssize_t nfs_file_read(struct kiocb *, char __user *, size_t, loff_t);
-static ssize_t nfs_file_write(struct kiocb *, const char __user *, size_t, loff_t);
+static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos);
+static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos);
 static int  nfs_file_flush(struct file *, fl_owner_t id);
 static int  nfs_fsync(struct file *, struct dentry *dentry, int datasync);
 static int nfs_check_flags(int flags);
@@ -53,8 +55,8 @@ const struct file_operations nfs_file_operations = {
 	.llseek		= nfs_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.aio_read		= nfs_file_read,
-	.aio_write		= nfs_file_write,
+	.aio_read	= nfs_file_read,
+	.aio_write	= nfs_file_write,
 	.mmap		= nfs_file_mmap,
 	.open		= nfs_file_open,
 	.flush		= nfs_file_flush,
@@ -196,15 +198,17 @@ nfs_file_flush(struct file *file, fl_owner_t id)
 }
 
 static ssize_t
-nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
+nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
 {
 	struct dentry * dentry = iocb->ki_filp->f_dentry;
 	struct inode * inode = dentry->d_inode;
 	ssize_t result;
+	size_t count = iov_length(iov, nr_segs);
 
 #ifdef CONFIG_NFS_DIRECTIO
 	if (iocb->ki_filp->f_flags & O_DIRECT)
-		return nfs_file_direct_read(iocb, buf, count, pos);
+		return nfs_file_direct_read(iocb, iov, nr_segs, pos);
 #endif
 
 	dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
@@ -214,7 +218,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
 	result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 	nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, count);
 	if (!result)
-		result = generic_file_aio_read(iocb, buf, count, pos);
+		result = generic_file_aio_read(iocb, iov, nr_segs, pos);
 	return result;
 }
 
@@ -336,24 +340,22 @@ const struct address_space_operations nfs_file_aops = {
 #endif
 };
 
-/* 
- * Write to a file (through the page cache).
- */
-static ssize_t
-nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
+static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t pos)
 {
 	struct dentry * dentry = iocb->ki_filp->f_dentry;
 	struct inode * inode = dentry->d_inode;
 	ssize_t result;
+	size_t count = iov_length(iov, nr_segs);
 
 #ifdef CONFIG_NFS_DIRECTIO
 	if (iocb->ki_filp->f_flags & O_DIRECT)
-		return nfs_file_direct_write(iocb, buf, count, pos);
+		return nfs_file_direct_write(iocb, iov, nr_segs, pos);
 #endif
 
-	dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%lu)\n",
+	dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
-		inode->i_ino, (unsigned long) count, (unsigned long) pos);
+		inode->i_ino, (unsigned long) count, (long long) pos);
 
 	result = -EBUSY;
 	if (IS_SWAPFILE(inode))
@@ -372,7 +374,7 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
 		goto out;
 
 	nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count);
-	result = generic_file_aio_write(iocb, buf, count, pos);
+	result = generic_file_aio_write(iocb, iov, nr_segs, pos);
 out:
 	return result;
 
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 76b08ae9ed82..20c6f39ea38a 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -9,7 +9,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index e8c143d182c4..08cc4c5919ab 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -131,6 +131,15 @@ void nfs_zap_caches(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 }
 
+void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
+{
+	if (mapping->nrpages != 0) {
+		spin_lock(&inode->i_lock);
+		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+		spin_unlock(&inode->i_lock);
+	}
+}
+
 static void nfs_zap_acl_cache(struct inode *inode)
 {
 	void (*clear_acl_cache)(struct inode *);
@@ -277,10 +286,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			 * report the blocks in 512byte units
 			 */
 			inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-			inode->i_blksize = inode->i_sb->s_blocksize;
 		} else {
 			inode->i_blocks = fattr->du.nfs2.blocks;
-			inode->i_blksize = fattr->du.nfs2.blocksize;
 		}
 		nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
 		nfsi->attrtimeo_timestamp = jiffies;
@@ -443,7 +450,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, str
 {
 	struct nfs_open_context *ctx;
 
-	ctx = (struct nfs_open_context *)kmalloc(sizeof(*ctx), GFP_KERNEL);
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx != NULL) {
 		atomic_set(&ctx->count, 1);
 		ctx->dentry = dget(dentry);
@@ -576,7 +583,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 
 	nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
 	lock_kernel();
-	if (!inode || is_bad_inode(inode))
+	if (is_bad_inode(inode))
  		goto out_nowait;
 	if (NFS_STALE(inode))
  		goto out_nowait;
@@ -673,13 +680,20 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
 			|| nfs_attribute_timeout(inode))
 		ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	if (ret < 0)
+		goto out;
 
 	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
-		nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
-		if (S_ISREG(inode->i_mode))
-			nfs_sync_mapping(mapping);
-		invalidate_inode_pages2(mapping);
-
+		if (mapping->nrpages != 0) {
+			if (S_ISREG(inode->i_mode)) {
+				ret = nfs_sync_mapping(mapping);
+				if (ret < 0)
+					goto out;
+			}
+			ret = invalidate_inode_pages2(mapping);
+			if (ret < 0)
+				goto out;
+		}
 		spin_lock(&inode->i_lock);
 		nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
 		if (S_ISDIR(inode->i_mode)) {
@@ -689,10 +703,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 		}
 		spin_unlock(&inode->i_lock);
 
+		nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
 		dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
 				inode->i_sb->s_id,
 				(long long)NFS_FILEID(inode));
 	}
+out:
 	return ret;
 }
 
@@ -969,10 +985,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 		 * report the blocks in 512byte units
 		 */
 		inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-		inode->i_blksize = inode->i_sb->s_blocksize;
  	} else {
  		inode->i_blocks = fattr->du.nfs2.blocks;
- 		inode->i_blksize = fattr->du.nfs2.blocksize;
  	}
 
 	if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
@@ -1134,8 +1148,7 @@ static int __init nfs_init_inodecache(void)
 
 static void nfs_destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(nfs_inode_cachep))
-		printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(nfs_inode_cachep);
 }
 
 /*
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bea0b016bd70..d205466233f6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -93,15 +93,15 @@ extern void nfs_destroy_directcache(void);
 /* nfs2xdr.c */
 extern int nfs_stat_to_errno(int);
 extern struct rpc_procinfo nfs_procedures[];
-extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int);
+extern __be32 * nfs_decode_dirent(__be32 *, struct nfs_entry *, int);
 
 /* nfs3xdr.c */
 extern struct rpc_procinfo nfs3_procedures[];
-extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int);
+extern __be32 *nfs3_decode_dirent(__be32 *, struct nfs_entry *, int);
 
 /* nfs4xdr.c */
 #ifdef CONFIG_NFS_V4
-extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
 #endif
 
 /* nfs4proc.c */
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index d507b021207f..f75fe72b4160 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -95,7 +95,7 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version,
  * XDR encode/decode functions for MOUNT
  */
 static int
-xdr_encode_dirpath(struct rpc_rqst *req, u32 *p, const char *path)
+xdr_encode_dirpath(struct rpc_rqst *req, __be32 *p, const char *path)
 {
 	p = xdr_encode_string(p, path);
 
@@ -104,7 +104,7 @@ xdr_encode_dirpath(struct rpc_rqst *req, u32 *p, const char *path)
 }
 
 static int
-xdr_decode_fhstatus(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res)
+xdr_decode_fhstatus(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
 {
 	struct nfs_fh *fh = res->fh;
 
@@ -116,7 +116,7 @@ xdr_decode_fhstatus(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res)
 }
 
 static int
-xdr_decode_fhstatus3(struct rpc_rqst *req, u32 *p, struct mnt_fhstatus *res)
+xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p, struct mnt_fhstatus *res)
 {
 	struct nfs_fh *fh = res->fh;
 
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 77b00684894d..ec1114b33d89 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -7,8 +7,6 @@
  * NFS namespace
  */
 
-#include <linux/config.h>
-
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
@@ -26,6 +24,11 @@ LIST_HEAD(nfs_automount_list);
 static DECLARE_WORK(nfs_automount_task, nfs_expire_automounts, &nfs_automount_list);
 int nfs_mountpoint_expiry_timeout = 500 * HZ;
 
+static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+					const struct dentry *dentry,
+					struct nfs_fh *fh,
+					struct nfs_fattr *fattr);
+
 /*
  * nfs_path - reconstruct the path given an arbitrary dentry
  * @base - arbitrary string to prepend to the path
@@ -209,9 +212,10 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
  * @fattr - attributes for new root inode
  *
  */
-struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
-		const struct dentry *dentry, struct nfs_fh *fh,
-		struct nfs_fattr *fattr)
+static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
+					const struct dentry *dentry,
+					struct nfs_fh *fh,
+					struct nfs_fattr *fattr)
 {
 	struct nfs_clone_mount mountdata = {
 		.sb = mnt_parent->mnt_sb,
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index b49501fc0a79..3be4e72a0227 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -66,15 +66,15 @@
 /*
  * Common NFS XDR functions as inlines
  */
-static inline u32 *
-xdr_encode_fhandle(u32 *p, struct nfs_fh *fhandle)
+static inline __be32 *
+xdr_encode_fhandle(__be32 *p, struct nfs_fh *fhandle)
 {
 	memcpy(p, fhandle->data, NFS2_FHSIZE);
 	return p + XDR_QUADLEN(NFS2_FHSIZE);
 }
 
-static inline u32 *
-xdr_decode_fhandle(u32 *p, struct nfs_fh *fhandle)
+static inline __be32 *
+xdr_decode_fhandle(__be32 *p, struct nfs_fh *fhandle)
 {
 	/* NFSv2 handles have a fixed length */
 	fhandle->size = NFS2_FHSIZE;
@@ -82,8 +82,8 @@ xdr_decode_fhandle(u32 *p, struct nfs_fh *fhandle)
 	return p + XDR_QUADLEN(NFS2_FHSIZE);
 }
 
-static inline u32*
-xdr_encode_time(u32 *p, struct timespec *timep)
+static inline __be32*
+xdr_encode_time(__be32 *p, struct timespec *timep)
 {
 	*p++ = htonl(timep->tv_sec);
 	/* Convert nanoseconds into microseconds */
@@ -91,8 +91,8 @@ xdr_encode_time(u32 *p, struct timespec *timep)
 	return p;
 }
 
-static inline u32*
-xdr_encode_current_server_time(u32 *p, struct timespec *timep)
+static inline __be32*
+xdr_encode_current_server_time(__be32 *p, struct timespec *timep)
 {
 	/*
 	 * Passing the invalid value useconds=1000000 is a
@@ -108,8 +108,8 @@ xdr_encode_current_server_time(u32 *p, struct timespec *timep)
 	return p;
 }
 
-static inline u32*
-xdr_decode_time(u32 *p, struct timespec *timep)
+static inline __be32*
+xdr_decode_time(__be32 *p, struct timespec *timep)
 {
 	timep->tv_sec = ntohl(*p++);
 	/* Convert microseconds into nanoseconds */
@@ -117,8 +117,8 @@ xdr_decode_time(u32 *p, struct timespec *timep)
 	return p;
 }
 
-static u32 *
-xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
+static __be32 *
+xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
 	u32 rdev;
 	fattr->type = (enum nfs_ftype) ntohl(*p++);
@@ -146,10 +146,10 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	return p;
 }
 
-static inline u32 *
-xdr_encode_sattr(u32 *p, struct iattr *attr)
+static inline __be32 *
+xdr_encode_sattr(__be32 *p, struct iattr *attr)
 {
-	const u32 not_set = __constant_htonl(0xFFFFFFFF);
+	const __be32 not_set = __constant_htonl(0xFFFFFFFF);
 
 	*p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set;
 	*p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set;
@@ -184,7 +184,7 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
  * GETATTR, READLINK, STATFS
  */
 static int
-nfs_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
+nfs_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
 {
 	p = xdr_encode_fhandle(p, fh);
 	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
@@ -195,7 +195,7 @@ nfs_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
  * Encode SETATTR arguments
  */
 static int
-nfs_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs_sattrargs *args)
+nfs_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs_sattrargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	p = xdr_encode_sattr(p, args->sattr);
@@ -208,7 +208,7 @@ nfs_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs_sattrargs *args)
  * LOOKUP, REMOVE, RMDIR
  */
 static int
-nfs_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs_diropargs *args)
+nfs_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs_diropargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	p = xdr_encode_array(p, args->name, args->len);
@@ -222,7 +222,7 @@ nfs_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs_diropargs *args)
  * exactly to the page we want to fetch.
  */
 static int
-nfs_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
+nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
 	struct rpc_auth	*auth = req->rq_task->tk_auth;
 	unsigned int replen;
@@ -246,7 +246,7 @@ nfs_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
  * Decode READ reply
  */
 static int
-nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
+nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 {
 	struct kvec *iov = req->rq_rcv_buf.head;
 	int	status, count, recvd, hdrlen;
@@ -286,7 +286,7 @@ nfs_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
  * Write arguments. Splice the buffer to be written into the iovec.
  */
 static int
-nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
+nfs_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
 {
 	struct xdr_buf *sndbuf = &req->rq_snd_buf;
 	u32 offset = (u32)args->offset;
@@ -309,7 +309,7 @@ nfs_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
  * CREATE, MKDIR
  */
 static int
-nfs_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs_createargs *args)
+nfs_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs_createargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	p = xdr_encode_array(p, args->name, args->len);
@@ -322,7 +322,7 @@ nfs_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs_createargs *args)
  * Encode RENAME arguments
  */
 static int
-nfs_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs_renameargs *args)
+nfs_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs_renameargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fromfh);
 	p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -336,7 +336,7 @@ nfs_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs_renameargs *args)
  * Encode LINK arguments
  */
 static int
-nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args)
+nfs_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs_linkargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fromfh);
 	p = xdr_encode_fhandle(p, args->tofh);
@@ -349,7 +349,7 @@ nfs_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs_linkargs *args)
  * Encode SYMLINK arguments
  */
 static int
-nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args)
+nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *args)
 {
 	struct xdr_buf *sndbuf = &req->rq_snd_buf;
 	size_t pad;
@@ -378,7 +378,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_symlinkargs *args)
  * Encode arguments to readdir call
  */
 static int
-nfs_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs_readdirargs *args)
+nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
 {
 	struct rpc_task	*task = req->rq_task;
 	struct rpc_auth	*auth = task->tk_auth;
@@ -404,7 +404,7 @@ nfs_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs_readdirargs *args)
  * from nfs_readdir for each entry.
  */
 static int
-nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
+nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy)
 {
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
@@ -412,7 +412,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
 	int hdrlen, recvd;
 	int status, nr;
 	unsigned int len, pglen;
-	u32 *end, *entry, *kaddr;
+	__be32 *end, *entry, *kaddr;
 
 	if ((status = ntohl(*p++)))
 		return -nfs_stat_to_errno(status);
@@ -432,8 +432,8 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, void *dummy)
 	if (pglen > recvd)
 		pglen = recvd;
 	page = rcvbuf->pages;
-	kaddr = p = (u32 *)kmap_atomic(*page, KM_USER0);
-	end = (u32 *)((char *)p + pglen);
+	kaddr = p = kmap_atomic(*page, KM_USER0);
+	end = (__be32 *)((char *)p + pglen);
 	entry = p;
 	for (nr = 0; *p++; nr++) {
 		if (p + 2 > end)
@@ -468,8 +468,8 @@ err_unmap:
 	goto out;
 }
 
-u32 *
-nfs_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
+__be32 *
+nfs_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 {
 	if (!*p++) {
 		if (!*p)
@@ -496,7 +496,7 @@ nfs_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
  * Decode simple status reply
  */
 static int
-nfs_xdr_stat(struct rpc_rqst *req, u32 *p, void *dummy)
+nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy)
 {
 	int	status;
 
@@ -510,7 +510,7 @@ nfs_xdr_stat(struct rpc_rqst *req, u32 *p, void *dummy)
  * GETATTR, SETATTR, WRITE
  */
 static int
-nfs_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
 	int	status;
 
@@ -525,7 +525,7 @@ nfs_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
  * LOOKUP, CREATE, MKDIR
  */
 static int
-nfs_xdr_diropres(struct rpc_rqst *req, u32 *p, struct nfs_diropok *res)
+nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
 {
 	int	status;
 
@@ -540,7 +540,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, u32 *p, struct nfs_diropok *res)
  * Encode READLINK args
  */
 static int
-nfs_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_readlinkargs *args)
+nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
 {
 	struct rpc_auth *auth = req->rq_task->tk_auth;
 	unsigned int replen;
@@ -558,7 +558,7 @@ nfs_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs_readlinkargs *args
  * Decode READLINK reply
  */
 static int
-nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy)
+nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy)
 {
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
@@ -601,7 +601,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, void *dummy)
  * Decode WRITE reply
  */
 static int
-nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
+nfs_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
 {
 	res->verf->committed = NFS_FILE_SYNC;
 	return nfs_xdr_attrstat(req, p, res->fattr);
@@ -611,7 +611,7 @@ nfs_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
  * Decode STATFS reply
  */
 static int
-nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs2_fsstat *res)
+nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res)
 {
 	int	status;
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f8688eaa0001..e5f128ffc32d 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -449,7 +449,7 @@ nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr
 		struct nfs_fattr res;
 	} *ptr;
 
-	ptr = (struct unlinkxdr *)kmalloc(sizeof(*ptr), GFP_KERNEL);
+	ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
 	if (!ptr)
 		return -ENOMEM;
 	ptr->arg.fh = NFS_FH(dir->d_inode);
@@ -668,7 +668,7 @@ nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
 {
 	struct inode		*dir = dentry->d_inode;
 	struct nfs_fattr	dir_attr;
-	u32			*verf = NFS_COOKIEVERF(dir);
+	__be32			*verf = NFS_COOKIEVERF(dir);
 	struct nfs3_readdirargs	arg = {
 		.fh		= NFS_FH(dir),
 		.cookie		= cookie,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 16556fa4effb..0ace092d126f 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -105,14 +105,14 @@ static struct {
 /*
  * Common NFS XDR functions as inlines
  */
-static inline u32 *
-xdr_encode_fhandle(u32 *p, struct nfs_fh *fh)
+static inline __be32 *
+xdr_encode_fhandle(__be32 *p, struct nfs_fh *fh)
 {
 	return xdr_encode_array(p, fh->data, fh->size);
 }
 
-static inline u32 *
-xdr_decode_fhandle(u32 *p, struct nfs_fh *fh)
+static inline __be32 *
+xdr_decode_fhandle(__be32 *p, struct nfs_fh *fh)
 {
 	if ((fh->size = ntohl(*p++)) <= NFS3_FHSIZE) {
 		memcpy(fh->data, p, fh->size);
@@ -124,24 +124,24 @@ xdr_decode_fhandle(u32 *p, struct nfs_fh *fh)
 /*
  * Encode/decode time.
  */
-static inline u32 *
-xdr_encode_time3(u32 *p, struct timespec *timep)
+static inline __be32 *
+xdr_encode_time3(__be32 *p, struct timespec *timep)
 {
 	*p++ = htonl(timep->tv_sec);
 	*p++ = htonl(timep->tv_nsec);
 	return p;
 }
 
-static inline u32 *
-xdr_decode_time3(u32 *p, struct timespec *timep)
+static inline __be32 *
+xdr_decode_time3(__be32 *p, struct timespec *timep)
 {
 	timep->tv_sec = ntohl(*p++);
 	timep->tv_nsec = ntohl(*p++);
 	return p;
 }
 
-static u32 *
-xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
+static __be32 *
+xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
 	unsigned int	type, major, minor;
 	int		fmode;
@@ -177,8 +177,8 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr)
 	return p;
 }
 
-static inline u32 *
-xdr_encode_sattr(u32 *p, struct iattr *attr)
+static inline __be32 *
+xdr_encode_sattr(__be32 *p, struct iattr *attr)
 {
 	if (attr->ia_valid & ATTR_MODE) {
 		*p++ = xdr_one;
@@ -223,8 +223,8 @@ xdr_encode_sattr(u32 *p, struct iattr *attr)
 	return p;
 }
 
-static inline u32 *
-xdr_decode_wcc_attr(u32 *p, struct nfs_fattr *fattr)
+static inline __be32 *
+xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
 {
 	p = xdr_decode_hyper(p, &fattr->pre_size);
 	p = xdr_decode_time3(p, &fattr->pre_mtime);
@@ -233,16 +233,16 @@ xdr_decode_wcc_attr(u32 *p, struct nfs_fattr *fattr)
 	return p;
 }
 
-static inline u32 *
-xdr_decode_post_op_attr(u32 *p, struct nfs_fattr *fattr)
+static inline __be32 *
+xdr_decode_post_op_attr(__be32 *p, struct nfs_fattr *fattr)
 {
 	if (*p++)
 		p = xdr_decode_fattr(p, fattr);
 	return p;
 }
 
-static inline u32 *
-xdr_decode_pre_op_attr(u32 *p, struct nfs_fattr *fattr)
+static inline __be32 *
+xdr_decode_pre_op_attr(__be32 *p, struct nfs_fattr *fattr)
 {
 	if (*p++)
 		return xdr_decode_wcc_attr(p, fattr);
@@ -250,8 +250,8 @@ xdr_decode_pre_op_attr(u32 *p, struct nfs_fattr *fattr)
 }
 
 
-static inline u32 *
-xdr_decode_wcc_data(u32 *p, struct nfs_fattr *fattr)
+static inline __be32 *
+xdr_decode_wcc_data(__be32 *p, struct nfs_fattr *fattr)
 {
 	p = xdr_decode_pre_op_attr(p, fattr);
 	return xdr_decode_post_op_attr(p, fattr);
@@ -265,7 +265,7 @@ xdr_decode_wcc_data(u32 *p, struct nfs_fattr *fattr)
  * Encode file handle argument
  */
 static int
-nfs3_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
+nfs3_xdr_fhandle(struct rpc_rqst *req, __be32 *p, struct nfs_fh *fh)
 {
 	p = xdr_encode_fhandle(p, fh);
 	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
@@ -276,7 +276,7 @@ nfs3_xdr_fhandle(struct rpc_rqst *req, u32 *p, struct nfs_fh *fh)
  * Encode SETATTR arguments
  */
 static int
-nfs3_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs3_sattrargs *args)
+nfs3_xdr_sattrargs(struct rpc_rqst *req, __be32 *p, struct nfs3_sattrargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	p = xdr_encode_sattr(p, args->sattr);
@@ -291,7 +291,7 @@ nfs3_xdr_sattrargs(struct rpc_rqst *req, u32 *p, struct nfs3_sattrargs *args)
  * Encode directory ops argument
  */
 static int
-nfs3_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs3_diropargs *args)
+nfs3_xdr_diropargs(struct rpc_rqst *req, __be32 *p, struct nfs3_diropargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	p = xdr_encode_array(p, args->name, args->len);
@@ -303,7 +303,7 @@ nfs3_xdr_diropargs(struct rpc_rqst *req, u32 *p, struct nfs3_diropargs *args)
  * Encode access() argument
  */
 static int
-nfs3_xdr_accessargs(struct rpc_rqst *req, u32 *p, struct nfs3_accessargs *args)
+nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	*p++ = htonl(args->access);
@@ -317,7 +317,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, u32 *p, struct nfs3_accessargs *args)
  * exactly to the page we want to fetch.
  */
 static int
-nfs3_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
+nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
 	struct rpc_auth	*auth = req->rq_task->tk_auth;
 	unsigned int replen;
@@ -339,7 +339,7 @@ nfs3_xdr_readargs(struct rpc_rqst *req, u32 *p, struct nfs_readargs *args)
  * Write arguments. Splice the buffer to be written into the iovec.
  */
 static int
-nfs3_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
+nfs3_xdr_writeargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
 {
 	struct xdr_buf *sndbuf = &req->rq_snd_buf;
 	u32 count = args->count;
@@ -360,7 +360,7 @@ nfs3_xdr_writeargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
  * Encode CREATE arguments
  */
 static int
-nfs3_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs3_createargs *args)
+nfs3_xdr_createargs(struct rpc_rqst *req, __be32 *p, struct nfs3_createargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	p = xdr_encode_array(p, args->name, args->len);
@@ -380,7 +380,7 @@ nfs3_xdr_createargs(struct rpc_rqst *req, u32 *p, struct nfs3_createargs *args)
  * Encode MKDIR arguments
  */
 static int
-nfs3_xdr_mkdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_mkdirargs *args)
+nfs3_xdr_mkdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mkdirargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	p = xdr_encode_array(p, args->name, args->len);
@@ -393,7 +393,7 @@ nfs3_xdr_mkdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_mkdirargs *args)
  * Encode SYMLINK arguments
  */
 static int
-nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args)
+nfs3_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_symlinkargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fromfh);
 	p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -410,7 +410,7 @@ nfs3_xdr_symlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_symlinkargs *args
  * Encode MKNOD arguments
  */
 static int
-nfs3_xdr_mknodargs(struct rpc_rqst *req, u32 *p, struct nfs3_mknodargs *args)
+nfs3_xdr_mknodargs(struct rpc_rqst *req, __be32 *p, struct nfs3_mknodargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	p = xdr_encode_array(p, args->name, args->len);
@@ -429,7 +429,7 @@ nfs3_xdr_mknodargs(struct rpc_rqst *req, u32 *p, struct nfs3_mknodargs *args)
  * Encode RENAME arguments
  */
 static int
-nfs3_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs3_renameargs *args)
+nfs3_xdr_renameargs(struct rpc_rqst *req, __be32 *p, struct nfs3_renameargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fromfh);
 	p = xdr_encode_array(p, args->fromname, args->fromlen);
@@ -443,7 +443,7 @@ nfs3_xdr_renameargs(struct rpc_rqst *req, u32 *p, struct nfs3_renameargs *args)
  * Encode LINK arguments
  */
 static int
-nfs3_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs3_linkargs *args)
+nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fromfh);
 	p = xdr_encode_fhandle(p, args->tofh);
@@ -456,7 +456,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, u32 *p, struct nfs3_linkargs *args)
  * Encode arguments to readdir call
  */
 static int
-nfs3_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_readdirargs *args)
+nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
 {
 	struct rpc_auth	*auth = req->rq_task->tk_auth;
 	unsigned int replen;
@@ -485,7 +485,7 @@ nfs3_xdr_readdirargs(struct rpc_rqst *req, u32 *p, struct nfs3_readdirargs *args
  * We just check for syntactical correctness.
  */
 static int
-nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
+nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res)
 {
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
@@ -493,7 +493,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
 	int hdrlen, recvd;
 	int status, nr;
 	unsigned int len, pglen;
-	u32 *entry, *end, *kaddr;
+	__be32 *entry, *end, *kaddr;
 
 	status = ntohl(*p++);
 	/* Decode post_op_attrs */
@@ -523,8 +523,8 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs3_readdirres *res)
 	if (pglen > recvd)
 		pglen = recvd;
 	page = rcvbuf->pages;
-	kaddr = p = (u32 *)kmap_atomic(*page, KM_USER0);
-	end = (u32 *)((char *)p + pglen);
+	kaddr = p = kmap_atomic(*page, KM_USER0);
+	end = (__be32 *)((char *)p + pglen);
 	entry = p;
 	for (nr = 0; *p++; nr++) {
 		if (p + 3 > end)
@@ -583,8 +583,8 @@ err_unmap:
 	goto out;
 }
 
-u32 *
-nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
+__be32 *
+nfs3_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 {
 	struct nfs_entry old = *entry;
 
@@ -626,7 +626,7 @@ nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus)
  * Encode COMMIT arguments
  */
 static int
-nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
+nfs3_xdr_commitargs(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
 {
 	p = xdr_encode_fhandle(p, args->fh);
 	p = xdr_encode_hyper(p, args->offset);
@@ -640,7 +640,7 @@ nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
  * Encode GETACL arguments
  */
 static int
-nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p,
+nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
 		    struct nfs3_getaclargs *args)
 {
 	struct rpc_auth *auth = req->rq_task->tk_auth;
@@ -664,7 +664,7 @@ nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p,
  * Encode SETACL arguments
  */
 static int
-nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p,
+nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
                    struct nfs3_setaclargs *args)
 {
 	struct xdr_buf *buf = &req->rq_snd_buf;
@@ -711,7 +711,7 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p,
  * Decode attrstat reply.
  */
 static int
-nfs3_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
 	int	status;
 
@@ -726,7 +726,7 @@ nfs3_xdr_attrstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
  * SATTR, REMOVE, RMDIR
  */
 static int
-nfs3_xdr_wccstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
 	int	status;
 
@@ -740,7 +740,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
  * Decode LOOKUP reply
  */
 static int
-nfs3_xdr_lookupres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
+nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
 {
 	int	status;
 
@@ -759,7 +759,7 @@ nfs3_xdr_lookupres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
  * Decode ACCESS reply
  */
 static int
-nfs3_xdr_accessres(struct rpc_rqst *req, u32 *p, struct nfs3_accessres *res)
+nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
 {
 	int	status = ntohl(*p++);
 
@@ -771,7 +771,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, u32 *p, struct nfs3_accessres *res)
 }
 
 static int
-nfs3_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_readlinkargs *args)
+nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
 {
 	struct rpc_auth *auth = req->rq_task->tk_auth;
 	unsigned int replen;
@@ -789,7 +789,7 @@ nfs3_xdr_readlinkargs(struct rpc_rqst *req, u32 *p, struct nfs3_readlinkargs *ar
  * Decode READLINK reply
  */
 static int
-nfs3_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
@@ -837,7 +837,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
  * Decode READ reply
  */
 static int
-nfs3_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
+nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res)
 {
 	struct kvec *iov = req->rq_rcv_buf.head;
 	int	status, count, ocount, recvd, hdrlen;
@@ -888,7 +888,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, u32 *p, struct nfs_readres *res)
  * Decode WRITE response
  */
 static int
-nfs3_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
+nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
 {
 	int	status;
 
@@ -910,7 +910,7 @@ nfs3_xdr_writeres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
  * Decode a CREATE response
  */
 static int
-nfs3_xdr_createres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
+nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res)
 {
 	int	status;
 
@@ -937,7 +937,7 @@ nfs3_xdr_createres(struct rpc_rqst *req, u32 *p, struct nfs3_diropres *res)
  * Decode RENAME reply
  */
 static int
-nfs3_xdr_renameres(struct rpc_rqst *req, u32 *p, struct nfs3_renameres *res)
+nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res)
 {
 	int	status;
 
@@ -952,7 +952,7 @@ nfs3_xdr_renameres(struct rpc_rqst *req, u32 *p, struct nfs3_renameres *res)
  * Decode LINK reply
  */
 static int
-nfs3_xdr_linkres(struct rpc_rqst *req, u32 *p, struct nfs3_linkres *res)
+nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res)
 {
 	int	status;
 
@@ -967,7 +967,7 @@ nfs3_xdr_linkres(struct rpc_rqst *req, u32 *p, struct nfs3_linkres *res)
  * Decode FSSTAT reply
  */
 static int
-nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res)
+nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res)
 {
 	int		status;
 
@@ -992,7 +992,7 @@ nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res)
  * Decode FSINFO reply
  */
 static int
-nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res)
+nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res)
 {
 	int		status;
 
@@ -1020,7 +1020,7 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res)
  * Decode PATHCONF reply
  */
 static int
-nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res)
+nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res)
 {
 	int		status;
 
@@ -1040,7 +1040,7 @@ nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res)
  * Decode COMMIT reply
  */
 static int
-nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
+nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res)
 {
 	int		status;
 
@@ -1059,7 +1059,7 @@ nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
  * Decode GETACL reply
  */
 static int
-nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p,
+nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p,
 		   struct nfs3_getaclres *res)
 {
 	struct xdr_buf *buf = &req->rq_rcv_buf;
@@ -1091,7 +1091,7 @@ nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p,
  * Decode setacl reply.
  */
 static int
-nfs3_xdr_setaclres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr)
 {
 	int status = ntohl(*p++);
 
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 61095fe4b5ca..6f346677332d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -212,7 +212,7 @@ extern void nfs_free_seqid(struct nfs_seqid *seqid);
 extern const nfs4_stateid zero_stateid;
 
 /* nfs4xdr.c */
-extern uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus);
+extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
 extern struct rpc_procinfo nfs4_procedures[];
 
 struct nfs4_mount_data;
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 24e47f3bbd17..b872779d7cd5 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -7,8 +7,6 @@
  * NFSv4 namespace
  */
 
-#include <linux/config.h>
-
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 47c7e6e3910d..8118036cc449 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -138,10 +138,10 @@ const u32 nfs4_fs_locations_bitmap[2] = {
 	| FATTR4_WORD1_MOUNTED_ON_FILEID
 };
 
-static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
+static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry,
 		struct nfs4_readdir_arg *readdir)
 {
-	u32 *start, *p;
+	__be32 *start, *p;
 
 	BUG_ON(readdir->count < 80);
 	if (cookie > 2) {
@@ -162,7 +162,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry,
 	 * when talking to the server, we always send cookie 0
 	 * instead of 1 or 2.
 	 */
-	start = p = (u32 *)kmap_atomic(*readdir->pages, KM_USER0);
+	start = p = kmap_atomic(*readdir->pages, KM_USER0);
 	
 	if (cookie == 0) {
 		*p++ = xdr_one;                                  /* next */
@@ -1314,11 +1314,9 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
 			case -EROFS:
 				lookup_instantiate_filp(nd, (struct dentry *)state, NULL);
 				return 1;
-			case -ENOENT:
-				if (dentry->d_inode == NULL)
-					return 1;
+			default:
+				goto out_drop;
 		}
-		goto out_drop;
 	}
 	if (state->inode == dentry->d_inode) {
 		nfs4_intent_set_file(nd, dentry, state);
@@ -2917,11 +2915,11 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po
 		.rpc_resp = clp,
 		.rpc_cred = cred,
 	};
-	u32 *p;
+	__be32 *p;
 	int loop = 0;
 	int status;
 
-	p = (u32*)sc_verifier.data;
+	p = (__be32*)sc_verifier.data;
 	*p++ = htonl((u32)clp->cl_boot_time.tv_sec);
 	*p = htonl((u32)clp->cl_boot_time.tv_nsec);
 
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 3dd413f52da1..0cf3fa312a33 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -471,7 +471,7 @@ struct compound_hdr {
 
 static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	p = xdr_reserve_space(xdr, 4 + len);
 	BUG_ON(p == NULL);
@@ -480,7 +480,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
 
 static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
 	BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
@@ -494,7 +494,7 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 
 static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
 	BUG_ON(p == NULL);
@@ -507,8 +507,8 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
 	char owner_group[IDMAP_NAMESZ];
 	int owner_namelen = 0;
 	int owner_grouplen = 0;
-	uint32_t *p;
-	uint32_t *q;
+	__be32 *p;
+	__be32 *q;
 	int len;
 	uint32_t bmval0 = 0;
 	uint32_t bmval1 = 0;
@@ -630,7 +630,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
 
 static int encode_access(struct xdr_stream *xdr, u32 access)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8);
 	WRITE32(OP_ACCESS);
@@ -641,7 +641,7 @@ static int encode_access(struct xdr_stream *xdr, u32 access)
 
 static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8+sizeof(arg->stateid->data));
 	WRITE32(OP_CLOSE);
@@ -653,7 +653,7 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
 
 static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args)
 {
-	uint32_t *p;
+	__be32 *p;
         
         RESERVE_SPACE(16);
         WRITE32(OP_COMMIT);
@@ -665,7 +665,7 @@ static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *arg
 
 static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create)
 {
-	uint32_t *p;
+	__be32 *p;
 	
 	RESERVE_SPACE(8);
 	WRITE32(OP_CREATE);
@@ -697,7 +697,7 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
 
 static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap)
 {
-        uint32_t *p;
+        __be32 *p;
 
         RESERVE_SPACE(12);
         WRITE32(OP_GETATTR);
@@ -708,7 +708,7 @@ static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap)
 
 static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1)
 {
-        uint32_t *p;
+        __be32 *p;
 
         RESERVE_SPACE(16);
         WRITE32(OP_GETATTR);
@@ -740,7 +740,7 @@ static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
 
 static int encode_getfh(struct xdr_stream *xdr)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	WRITE32(OP_GETFH);
@@ -750,7 +750,7 @@ static int encode_getfh(struct xdr_stream *xdr)
 
 static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8 + name->len);
 	WRITE32(OP_LINK);
@@ -780,7 +780,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
  */
 static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(32);
 	WRITE32(OP_LOCK);
@@ -809,7 +809,7 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
 
 static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(40);
 	WRITE32(OP_LOCKT);
@@ -825,7 +825,7 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
 
 static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(44);
 	WRITE32(OP_LOCKU);
@@ -841,7 +841,7 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
 static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
 {
 	int len = name->len;
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8 + len);
 	WRITE32(OP_LOOKUP);
@@ -853,7 +853,7 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
 
 static void encode_share_access(struct xdr_stream *xdr, int open_flags)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8);
 	switch (open_flags & (FMODE_READ|FMODE_WRITE)) {
@@ -874,7 +874,7 @@ static void encode_share_access(struct xdr_stream *xdr, int open_flags)
 
 static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 {
-	uint32_t *p;
+	__be32 *p;
  /*
  * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
  * owner 4 = 32
@@ -891,7 +891,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	switch(arg->open_flags & O_EXCL) {
@@ -907,7 +907,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
 
 static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	switch (arg->open_flags & O_CREAT) {
@@ -923,7 +923,7 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
 
 static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	switch (delegation_type) {
@@ -943,7 +943,7 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation
 
 static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *name)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	WRITE32(NFS4_OPEN_CLAIM_NULL);
@@ -952,7 +952,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
 
 static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	WRITE32(NFS4_OPEN_CLAIM_PREVIOUS);
@@ -961,7 +961,7 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
 
 static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struct qstr *name, const nfs4_stateid *stateid)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4+sizeof(stateid->data));
 	WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
@@ -991,7 +991,7 @@ static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 
 static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8+sizeof(arg->stateid->data));
 	WRITE32(OP_OPEN_CONFIRM);
@@ -1003,7 +1003,7 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
 
 static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8+sizeof(arg->stateid->data));
 	WRITE32(OP_OPEN_DOWNGRADE);
@@ -1017,7 +1017,7 @@ static int
 encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
 {
 	int len = fh->size;
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8 + len);
 	WRITE32(OP_PUTFH);
@@ -1029,7 +1029,7 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
 
 static int encode_putrootfh(struct xdr_stream *xdr)
 {
-        uint32_t *p;
+        __be32 *p;
         
         RESERVE_SPACE(4);
         WRITE32(OP_PUTROOTFH);
@@ -1040,7 +1040,7 @@ static int encode_putrootfh(struct xdr_stream *xdr)
 static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
 {
 	nfs4_stateid stateid;
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(16);
 	if (ctx->state != NULL) {
@@ -1052,7 +1052,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
 
 static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	WRITE32(OP_READ);
@@ -1074,7 +1074,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
 		FATTR4_WORD1_MOUNTED_ON_FILEID,
 	};
 	int replen;
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(32+sizeof(nfs4_verifier));
 	WRITE32(OP_READDIR);
@@ -1116,7 +1116,7 @@ static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *r
 {
 	struct rpc_auth *auth = req->rq_task->tk_auth;
 	unsigned int replen;
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	WRITE32(OP_READLINK);
@@ -1134,7 +1134,7 @@ static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *r
 
 static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8 + name->len);
 	WRITE32(OP_REMOVE);
@@ -1146,7 +1146,7 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
 
 static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(8 + oldname->len);
 	WRITE32(OP_RENAME);
@@ -1162,7 +1162,7 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
 
 static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(12);
 	WRITE32(OP_RENEW);
@@ -1174,7 +1174,7 @@ static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_
 static int
 encode_restorefh(struct xdr_stream *xdr)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	WRITE32(OP_RESTOREFH);
@@ -1185,7 +1185,7 @@ encode_restorefh(struct xdr_stream *xdr)
 static int
 encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4+sizeof(zero_stateid.data));
 	WRITE32(OP_SETATTR);
@@ -1204,7 +1204,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
 static int
 encode_savefh(struct xdr_stream *xdr)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	WRITE32(OP_SAVEFH);
@@ -1215,7 +1215,7 @@ encode_savefh(struct xdr_stream *xdr)
 static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server)
 {
 	int status;
-	uint32_t *p;
+	__be32 *p;
 	
         RESERVE_SPACE(4+sizeof(arg->stateid.data));
         WRITE32(OP_SETATTR);
@@ -1229,7 +1229,7 @@ static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *
 
 static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4 + sizeof(setclientid->sc_verifier->data));
 	WRITE32(OP_SETCLIENTID);
@@ -1248,7 +1248,7 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
 
 static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state)
 {
-        uint32_t *p;
+        __be32 *p;
 
         RESERVE_SPACE(12 + sizeof(client_state->cl_confirm.data));
         WRITE32(OP_SETCLIENTID_CONFIRM);
@@ -1260,7 +1260,7 @@ static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_c
 
 static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(4);
 	WRITE32(OP_WRITE);
@@ -1279,7 +1279,7 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args
 
 static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	RESERVE_SPACE(20);
 
@@ -1295,7 +1295,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
 /*
  * Encode an ACCESS request
  */
-static int nfs4_xdr_enc_access(struct rpc_rqst *req, uint32_t *p, const struct nfs4_accessargs *args)
+static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs4_accessargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1313,7 +1313,7 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, uint32_t *p, const struct n
 /*
  * Encode LOOKUP request
  */
-static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, uint32_t *p, const struct nfs4_lookup_arg *args)
+static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1337,7 +1337,7 @@ out:
 /*
  * Encode LOOKUP_ROOT request
  */
-static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, uint32_t *p, const struct nfs4_lookup_root_arg *args)
+static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struct nfs4_lookup_root_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1358,7 +1358,7 @@ out:
 /*
  * Encode REMOVE request
  */
-static int nfs4_xdr_enc_remove(struct rpc_rqst *req, uint32_t *p, const struct nfs4_remove_arg *args)
+static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs4_remove_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1380,7 +1380,7 @@ out:
 /*
  * Encode RENAME request
  */
-static int nfs4_xdr_enc_rename(struct rpc_rqst *req, uint32_t *p, const struct nfs4_rename_arg *args)
+static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs4_rename_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1410,7 +1410,7 @@ out:
 /*
  * Encode LINK request
  */
-static int nfs4_xdr_enc_link(struct rpc_rqst *req, uint32_t *p, const struct nfs4_link_arg *args)
+static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_link_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1440,7 +1440,7 @@ out:
 /*
  * Encode CREATE request
  */
-static int nfs4_xdr_enc_create(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args)
+static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1470,7 +1470,7 @@ out:
 /*
  * Encode SYMLINK request
  */
-static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_create_arg *args)
+static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_create_arg *args)
 {
 	return nfs4_xdr_enc_create(req, p, args);
 }
@@ -1478,7 +1478,7 @@ static int nfs4_xdr_enc_symlink(struct rpc_rqst *req, uint32_t *p, const struct
 /*
  * Encode GETATTR request
  */
-static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct nfs4_getattr_arg *args)
+static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nfs4_getattr_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1496,7 +1496,7 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, uint32_t *p, const struct
 /*
  * Encode a CLOSE request
  */
-static int nfs4_xdr_enc_close(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args)
+static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
 {
         struct xdr_stream xdr;
         struct compound_hdr hdr = {
@@ -1520,7 +1520,7 @@ out:
 /*
  * Encode an OPEN request
  */
-static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args)
+static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1556,7 +1556,7 @@ out:
 /*
  * Encode an OPEN_CONFIRM request
  */
-static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_open_confirmargs *args)
+static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_open_confirmargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1577,7 +1577,7 @@ out:
 /*
  * Encode an OPEN request with no attributes.
  */
-static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nfs_openargs *args)
+static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_openargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1601,7 +1601,7 @@ out:
 /*
  * Encode an OPEN_DOWNGRADE request
  */
-static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, uint32_t *p, struct nfs_closeargs *args)
+static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1625,7 +1625,7 @@ out:
 /*
  * Encode a LOCK request
  */
-static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lock_args *args)
+static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_args *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1646,7 +1646,7 @@ out:
 /*
  * Encode a LOCKT request
  */
-static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockt_args *args)
+static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_args *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1667,7 +1667,7 @@ out:
 /*
  * Encode a LOCKU request
  */
-static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_locku_args *args)
+static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_args *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1688,7 +1688,7 @@ out:
 /*
  * Encode a READLINK request
  */
-static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, uint32_t *p, const struct nfs4_readlink *args)
+static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct nfs4_readlink *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1709,7 +1709,7 @@ out:
 /*
  * Encode a READDIR request
  */
-static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, uint32_t *p, const struct nfs4_readdir_arg *args)
+static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nfs4_readdir_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1730,7 +1730,7 @@ out:
 /*
  * Encode a READ request
  */
-static int nfs4_xdr_enc_read(struct rpc_rqst *req, uint32_t *p, struct nfs_readargs *args)
+static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
 {
 	struct rpc_auth	*auth = req->rq_task->tk_auth;
 	struct xdr_stream xdr;
@@ -1762,7 +1762,7 @@ out:
 /*
  * Encode an SETATTR request
  */
-static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, uint32_t *p, struct nfs_setattrargs *args)
+static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
 
 {
         struct xdr_stream xdr;
@@ -1788,7 +1788,7 @@ out:
  * Encode a GETACL request
  */
 static int
-nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p,
+nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
 		struct nfs_getaclargs *args)
 {
 	struct xdr_stream xdr;
@@ -1815,7 +1815,7 @@ out:
 /*
  * Encode a WRITE request
  */
-static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args)
+static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1839,7 +1839,7 @@ out:
 /*
  *  a COMMIT request
  */
-static int nfs4_xdr_enc_commit(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args)
+static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_writeargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1863,7 +1863,7 @@ out:
 /*
  * FSINFO request
  */
-static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fsinfo_arg *args)
+static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsinfo_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1882,7 +1882,7 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs
 /*
  * a PATHCONF request
  */
-static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct nfs4_pathconf_arg *args)
+static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct nfs4_pathconf_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1902,7 +1902,7 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct
 /*
  * a STATFS request
  */
-static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct nfs4_statfs_arg *args)
+static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs4_statfs_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1923,7 +1923,7 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct n
 /*
  * GETATTR_BITMAP request
  */
-static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const struct nfs_fh *fhandle)
+static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struct nfs_fh *fhandle)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1945,7 +1945,7 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, uint32_t *p, const str
 /*
  * a RENEW request
  */
-static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp)
+static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1960,7 +1960,7 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, uint32_t *p, struct nfs_clie
 /*
  * a SETCLIENTID request
  */
-static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nfs4_setclientid *sc)
+static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4_setclientid *sc)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1975,7 +1975,7 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, uint32_t *p, struct nf
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_client *clp)
+static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_client *clp)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -1997,7 +1997,7 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
 /*
  * DELEGRETURN request
  */
-static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const struct nfs4_delegreturnargs *args)
+static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struct nfs4_delegreturnargs *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -2021,7 +2021,7 @@ out:
 /*
  * Encode FS_LOCATIONS request
  */
-static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations_arg *args)
+static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations_arg *args)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
@@ -2086,7 +2086,7 @@ out:
 
 static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	READ_BUF(4);
 	READ32(*len);
@@ -2097,7 +2097,7 @@ static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char
 
 static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	READ_BUF(8);
 	READ32(hdr->status);
@@ -2112,7 +2112,7 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
 
 static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t opnum;
 	int32_t nfserr;
 
@@ -2134,7 +2134,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 /* Dummy routine */
 static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
 {
-	uint32_t *p;
+	__be32 *p;
 	unsigned int strlen;
 	char *str;
 
@@ -2144,7 +2144,8 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
 
 static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 {
-	uint32_t bmlen, *p;
+	uint32_t bmlen;
+	__be32 *p;
 
 	READ_BUF(4);
 	READ32(bmlen);
@@ -2159,9 +2160,9 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
 	return 0;
 }
 
-static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, uint32_t **savep)
+static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	READ_BUF(4);
 	READ32(*attrlen);
@@ -2182,7 +2183,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
 
 static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*type = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2202,7 +2203,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 
 static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*change = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2219,7 +2220,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 
 static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*size = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2235,7 +2236,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
 
 static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*res = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
@@ -2251,7 +2252,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
 
 static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*res = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
@@ -2267,7 +2268,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 
 static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	fsid->major = 0;
 	fsid->minor = 0;
@@ -2287,7 +2288,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
 
 static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*res = 60;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
@@ -2303,7 +2304,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
 
 static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
@@ -2319,7 +2320,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 
 static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*fileid = 0;
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2335,7 +2336,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
 
 static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*fileid = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2351,7 +2352,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
 
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*res = 0;
@@ -2368,7 +2369,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 
 static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*res = 0;
@@ -2385,7 +2386,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 
 static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*res = 0;
@@ -2403,7 +2404,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
 {
 	int n;
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	READ_BUF(4);
@@ -2448,7 +2449,7 @@ out_eio:
 static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
 {
 	int n;
-	uint32_t *p;
+	__be32 *p;
 	int status = -EIO;
 
 	if (unlikely(bitmap[0] & (FATTR4_WORD0_FS_LOCATIONS -1U)))
@@ -2512,7 +2513,7 @@ out_eio:
 
 static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*res = 0;
@@ -2529,7 +2530,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
 
 static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*maxlink = 1;
@@ -2546,7 +2547,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 
 static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*maxname = 1024;
@@ -2563,7 +2564,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 
 static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*res = 1024;
@@ -2584,7 +2585,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
 
 static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*res = 1024;
@@ -2605,7 +2606,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
 
 static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*mode = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
@@ -2622,7 +2623,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
 
 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*nlink = 1;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2638,7 +2639,8 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *uid)
 {
-	uint32_t len, *p;
+	uint32_t len;
+	__be32 *p;
 
 	*uid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2662,7 +2664,8 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, int32_t *gid)
 {
-	uint32_t len, *p;
+	uint32_t len;
+	__be32 *p;
 
 	*gid = -2;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2686,7 +2689,8 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
 
 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
 {
-	uint32_t major = 0, minor = 0, *p;
+	uint32_t major = 0, minor = 0;
+	__be32 *p;
 
 	*rdev = MKDEV(0,0);
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2708,7 +2712,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
 
 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*res = 0;
@@ -2725,7 +2729,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
 
 static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*res = 0;
@@ -2742,7 +2746,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
 
 static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status = 0;
 
 	*res = 0;
@@ -2759,7 +2763,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 
 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	*used = 0;
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2776,7 +2780,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
 
 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint64_t sec;
 	uint32_t nsec;
 
@@ -2836,7 +2840,7 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
 	return status;
 }
 
-static int verify_attr_len(struct xdr_stream *xdr, uint32_t *savep, uint32_t attrlen)
+static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrlen)
 {
 	unsigned int attrwords = XDR_QUADLEN(attrlen);
 	unsigned int nwords = xdr->p - savep;
@@ -2854,7 +2858,7 @@ static int verify_attr_len(struct xdr_stream *xdr, uint32_t *savep, uint32_t att
 
 static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 {
-	uint32_t *p;
+	__be32 *p;
 
 	READ_BUF(20);
 	READ32(cinfo->atomic);
@@ -2865,7 +2869,7 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
 
 static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t supp, acc;
 	int status;
 
@@ -2882,7 +2886,7 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
 
 static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_CLOSE);
@@ -2895,7 +2899,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
 
 static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_COMMIT);
@@ -2908,7 +2912,7 @@ static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
 
 static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t bmlen;
 	int status;
 
@@ -2925,7 +2929,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 
 static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen, 
 		 bitmap[2] = {0};
 	int status;
@@ -2952,7 +2956,7 @@ xdr_error:
 	
 static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen, 
 		 bitmap[2] = {0};
 	int status;
@@ -2985,7 +2989,7 @@ xdr_error:
 
 static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen, 
 		 bitmap[2] = {0};
 	int status;
@@ -3010,7 +3014,7 @@ xdr_error:
 
 static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0},
 		 type;
@@ -3079,7 +3083,7 @@ xdr_error:
 
 static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen, bitmap[2];
 	int status;
 
@@ -3111,7 +3115,7 @@ xdr_error:
 
 static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t len;
 	int status;
 
@@ -3147,7 +3151,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 {
 	uint64_t offset, length, clientid;
-	uint32_t *p;
+	__be32 *p;
 	uint32_t namelen, type;
 
 	READ_BUF(32);
@@ -3172,7 +3176,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 
 static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCK);
@@ -3195,7 +3199,7 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
 
 static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_LOCKU);
@@ -3214,7 +3218,7 @@ static int decode_lookup(struct xdr_stream *xdr)
 /* This is too sick! */
 static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 {
-        uint32_t *p;
+        __be32 *p;
 	uint32_t limit_type, nblocks, blocksize;
 
 	READ_BUF(12);
@@ -3233,7 +3237,7 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
 
 static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 {
-        uint32_t *p;
+        __be32 *p;
         uint32_t delegation_type;
 
 	READ_BUF(4);
@@ -3259,7 +3263,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
 
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 {
-        uint32_t *p;
+        __be32 *p;
         uint32_t bmlen;
         int status;
 
@@ -3287,7 +3291,7 @@ xdr_error:
 
 static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
 {
-        uint32_t *p;
+        __be32 *p;
 	int status;
 
         status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
@@ -3300,7 +3304,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre
 
 static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
@@ -3324,7 +3328,7 @@ static int decode_putrootfh(struct xdr_stream *xdr)
 static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res)
 {
 	struct kvec *iov = req->rq_rcv_buf.head;
-	uint32_t *p;
+	__be32 *p;
 	uint32_t count, eof, recvd, hdrlen;
 	int status;
 
@@ -3354,7 +3358,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	struct page	*page = *rcvbuf->pages;
 	struct kvec	*iov = rcvbuf->head;
 	unsigned int	nr, pglen = rcvbuf->page_len;
-	uint32_t	*end, *entry, *p, *kaddr;
+	__be32		*end, *entry, *p, *kaddr;
 	uint32_t	len, attrlen, xlen;
 	int 		hdrlen, recvd, status;
 
@@ -3376,7 +3380,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
 	xdr_read_pages(xdr, pglen);
 
 	BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE);
-	kaddr = p = (uint32_t *) kmap_atomic(page, KM_USER0);
+	kaddr = p = kmap_atomic(page, KM_USER0);
 	end = p + ((pglen + readdir->pgbase) >> 2);
 	entry = p;
 	for (nr = 0; *p++; nr++) {
@@ -3428,7 +3432,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
 	struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
 	struct kvec *iov = rcvbuf->head;
 	int hdrlen, len, recvd;
-	uint32_t *p;
+	__be32 *p;
 	char *kaddr;
 	int status;
 
@@ -3505,7 +3509,7 @@ decode_restorefh(struct xdr_stream *xdr)
 static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 		size_t *acl_len)
 {
-	uint32_t *savep;
+	__be32 *savep;
 	uint32_t attrlen,
 		 bitmap[2] = {0};
 	struct kvec *iov = req->rq_rcv_buf.head;
@@ -3551,7 +3555,7 @@ decode_savefh(struct xdr_stream *xdr)
 
 static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t bmlen;
 	int status;
 
@@ -3567,7 +3571,7 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
 
 static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
 {
-	uint32_t *p;
+	__be32 *p;
 	uint32_t opnum;
 	int32_t nfserr;
 
@@ -3610,7 +3614,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr)
 
 static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
 {
-	uint32_t *p;
+	__be32 *p;
 	int status;
 
 	status = decode_op_hdr(xdr, OP_WRITE);
@@ -3632,7 +3636,7 @@ static int decode_delegreturn(struct xdr_stream *xdr)
 /*
  * Decode OPEN_DOWNGRADE response
  */
-static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
 {
         struct xdr_stream xdr;
         struct compound_hdr hdr;
@@ -3660,7 +3664,7 @@ out:
 /*
  * Decode ACCESS response
  */
-static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_accessres *res)
+static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3678,7 +3682,7 @@ out:
 /*
  * Decode LOOKUP response
  */
-static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3701,7 +3705,7 @@ out:
 /*
  * Decode LOOKUP_ROOT response
  */
-static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_lookup_res *res)
+static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lookup_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3721,7 +3725,7 @@ out:
 /*
  * Decode REMOVE response
  */
-static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_remove_res *res)
+static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_remove_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3742,7 +3746,7 @@ out:
 /*
  * Decode RENAME response
  */
-static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_rename_res *res)
+static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_rename_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3772,7 +3776,7 @@ out:
 /*
  * Decode LINK response
  */
-static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_link_res *res)
+static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3805,7 +3809,7 @@ out:
 /*
  * Decode CREATE response
  */
-static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3834,7 +3838,7 @@ out:
 /*
  * Decode SYMLINK response
  */
-static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_create_res *res)
+static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_create_res *res)
 {
 	return nfs4_xdr_dec_create(rqstp, p, res);
 }
@@ -3842,7 +3846,7 @@ static int nfs4_xdr_dec_symlink(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4
 /*
  * Decode GETATTR response
  */
-static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_getattr_res *res)
+static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_getattr_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3865,7 +3869,7 @@ out:
  * Encode an SETACL request
  */
 static int
-nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args)
+nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
 {
         struct xdr_stream xdr;
         struct compound_hdr hdr = {
@@ -3886,7 +3890,7 @@ out:
  * Decode SETACL response
  */
 static int
-nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res)
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, __be32 *p, void *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3908,7 +3912,7 @@ out:
  * Decode GETACL response
  */
 static int
-nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, size_t *acl_len)
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, __be32 *p, size_t *acl_len)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -3930,7 +3934,7 @@ out:
 /*
  * Decode CLOSE response
  */
-static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_closeres *res)
+static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
 {
         struct xdr_stream xdr;
         struct compound_hdr hdr;
@@ -3960,7 +3964,7 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
 {
         struct xdr_stream xdr;
         struct compound_hdr hdr;
@@ -3994,7 +3998,7 @@ out:
 /*
  * Decode OPEN_CONFIRM response
  */
-static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_open_confirmres *res)
+static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
 {
         struct xdr_stream xdr;
         struct compound_hdr hdr;
@@ -4015,7 +4019,7 @@ out:
 /*
  * Decode OPEN response
  */
-static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_openres *res)
+static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
 {
         struct xdr_stream xdr;
         struct compound_hdr hdr;
@@ -4039,7 +4043,7 @@ out:
 /*
  * Decode SETATTR response
  */
-static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_setattrres *res)
+static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
 {
         struct xdr_stream xdr;
         struct compound_hdr hdr;
@@ -4065,7 +4069,7 @@ out:
 /*
  * Decode LOCK response
  */
-static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lock_res *res)
+static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lock_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4086,7 +4090,7 @@ out:
 /*
  * Decode LOCKT response
  */
-static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockt_res *res)
+static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, __be32 *p, struct nfs_lockt_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4107,7 +4111,7 @@ out:
 /*
  * Decode LOCKU response
  */
-static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_locku_res *res)
+static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, __be32 *p, struct nfs_locku_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4128,7 +4132,7 @@ out:
 /*
  * Decode READLINK response
  */
-static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, uint32_t *p, void *res)
+static int nfs4_xdr_dec_readlink(struct rpc_rqst *rqstp, __be32 *p, void *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4149,7 +4153,7 @@ out:
 /*
  * Decode READDIR response
  */
-static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_readdir_res *res)
+static int nfs4_xdr_dec_readdir(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_readdir_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4170,7 +4174,7 @@ out:
 /*
  * Decode Read response
  */
-static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_readres *res)
+static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, __be32 *p, struct nfs_readres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4193,7 +4197,7 @@ out:
 /*
  * Decode WRITE response
  */
-static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4219,7 +4223,7 @@ out:
 /*
  * Decode COMMIT response
  */
-static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_writeres *res)
+static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writeres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4243,7 +4247,7 @@ out:
 /*
  * FSINFO request
  */
-static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4263,7 +4267,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs_fsi
 /*
  * PATHCONF request
  */
-static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, uint32_t *p, struct nfs_pathconf *pathconf)
+static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *pathconf)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4281,7 +4285,7 @@ static int nfs4_xdr_dec_pathconf(struct rpc_rqst *req, uint32_t *p, struct nfs_p
 /*
  * STATFS request
  */
-static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, uint32_t *p, struct nfs_fsstat *fsstat)
+static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *fsstat)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4299,7 +4303,7 @@ static int nfs4_xdr_dec_statfs(struct rpc_rqst *req, uint32_t *p, struct nfs_fss
 /*
  * GETATTR_BITMAP request
  */
-static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, uint32_t *p, struct nfs4_server_caps_res *res)
+static int nfs4_xdr_dec_server_caps(struct rpc_rqst *req, __be32 *p, struct nfs4_server_caps_res *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4318,7 +4322,7 @@ out:
 /*
  * Decode RENEW response
  */
-static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
+static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4334,7 +4338,7 @@ static int nfs4_xdr_dec_renew(struct rpc_rqst *rqstp, uint32_t *p, void *dummy)
 /*
  * a SETCLIENTID request
  */
-static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
+static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
 		struct nfs_client *clp)
 {
 	struct xdr_stream xdr;
@@ -4353,7 +4357,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, uint32_t *p,
 /*
  * a SETCLIENTID_CONFIRM request
  */
-static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, struct nfs_fsinfo *fsinfo)
+static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *fsinfo)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4375,7 +4379,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s
 /*
  * DELEGRETURN request
  */
-static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_delegreturnres *res)
+static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_delegreturnres *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4397,7 +4401,7 @@ out:
 /*
  * FS_LOCATIONS request
  */
-static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs_locations *res)
+static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs4_fs_locations *res)
 {
 	struct xdr_stream xdr;
 	struct compound_hdr hdr;
@@ -4417,7 +4421,7 @@ out:
 	return status;
 }
 
-uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus)
+__be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus)
 {
 	uint32_t bitmap[2] = {0};
 	uint32_t len;
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c0a754ecdee6..8dfefe41a8da 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -69,7 +69,6 @@
  *	Fabian Frederick:	Option parser rebuilt (using parser lib)
 */
 
-#include <linux/config.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
@@ -312,7 +311,7 @@ static int __init root_nfs_name(char *name)
 	/* Override them by options set on kernel command-line */
 	root_nfs_parse(name, buf);
 
-	cp = system_utsname.nodename;
+	cp = utsname()->nodename;
 	if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) {
 		printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
 		return -1;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 36e902a88ca1..829af323f288 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -392,7 +392,6 @@ int __init nfs_init_nfspagecache(void)
 
 void nfs_destroy_nfspagecache(void)
 {
-	if (kmem_cache_destroy(nfs_page_cachep))
-		printk(KERN_INFO "nfs_page: not all structures were freed\n");
+	kmem_cache_destroy(nfs_page_cachep);
 }
 
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 630e50647bbb..4529cc4f3f8f 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -352,7 +352,7 @@ nfs_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr *
 {
 	struct nfs_diropargs	*arg;
 
-	arg = (struct nfs_diropargs *)kmalloc(sizeof(*arg), GFP_KERNEL);
+	arg = kmalloc(sizeof(*arg), GFP_KERNEL);
 	if (!arg)
 		return -ENOMEM;
 	arg->fh = NFS_FH(dir->d_inode);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 69f1549da2b9..c2e49c397a27 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -737,6 +737,5 @@ int __init nfs_init_readpagecache(void)
 void nfs_destroy_readpagecache(void)
 {
 	mempool_destroy(nfs_rdata_mempool);
-	if (kmem_cache_destroy(nfs_rdata_cachep))
-		printk(KERN_INFO "nfs_read_data: not all structures were freed\n");
+	kmem_cache_destroy(nfs_rdata_cachep);
 }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e8d40030cab4..28108c82b887 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -20,7 +20,6 @@
  *   of another (see nfs_lookup())
  */
 
-#include <linux/config.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
@@ -835,7 +834,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	}
 	/* RFC3530: The default port for NFS is 2049 */
 	if (addr.sin_port == 0)
-		addr.sin_port = NFS_PORT;
+		addr.sin_port = htons(NFS_PORT);
 
 	/* Grab the authentication type */
 	authflavour = RPC_AUTH_UNIX;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c12effb46fe5..883dd4a1c157 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -51,13 +51,14 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
-#include <linux/mpage.h>
 #include <linux/writeback.h>
 
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs_page.h>
+#include <linux/backing-dev.h>
+
 #include <asm/uaccess.h>
 #include <linux/smp_lock.h>
 
@@ -396,7 +397,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 out:
 	clear_bit(BDI_write_congested, &bdi->state);
 	wake_up_all(&nfs_write_congestion);
-	writeback_congestion_end();
+	congestion_end(WRITE);
 	return err;
 }
 
@@ -589,10 +590,10 @@ static void nfs_cancel_commit_list(struct list_head *head)
 
 	while(!list_empty(head)) {
 		req = nfs_list_entry(head->next);
+		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
 		nfs_list_remove_request(req);
 		nfs_inode_remove_request(req);
-		dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
-		nfs_clear_page_writeback(req);
+		nfs_unlock_request(req);
 	}
 }
 
@@ -1565,7 +1566,6 @@ void nfs_destroy_writepagecache(void)
 {
 	mempool_destroy(nfs_commit_mempool);
 	mempool_destroy(nfs_wdata_mempool);
-	if (kmem_cache_destroy(nfs_wdata_cachep))
-		printk(KERN_INFO "nfs_write_data: not all structures were freed\n");
+	kmem_cache_destroy(nfs_wdata_cachep);
 }
 
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 0c2be8c0307d..c11f5375d7c1 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -46,7 +46,7 @@ xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
 {
 	struct nfsacl_encode_desc *nfsacl_desc =
 		(struct nfsacl_encode_desc *) desc;
-	u32 *p = (u32 *) elem;
+	__be32 *p = elem;
 
 	struct posix_acl_entry *entry =
 		&nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
@@ -127,7 +127,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
 {
 	struct nfsacl_decode_desc *nfsacl_desc =
 		(struct nfsacl_decode_desc *) desc;
-	u32 *p = (u32 *) elem;
+	__be32 *p = elem;
 	struct posix_acl_entry *entry;
 
 	if (!nfsacl_desc->acl) {
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 01bc68c628ad..f37df46d2eaa 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -319,12 +319,25 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
 
 static struct cache_head *export_table[EXPORT_HASHMAX];
 
+static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
+{
+	int i;
+
+	for (i = 0; i < fsloc->locations_count; i++) {
+		kfree(fsloc->locations[i].path);
+		kfree(fsloc->locations[i].hosts);
+	}
+	kfree(fsloc->locations);
+}
+
 static void svc_export_put(struct kref *ref)
 {
 	struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
 	dput(exp->ex_dentry);
 	mntput(exp->ex_mnt);
 	auth_domain_put(exp->ex_client);
+	kfree(exp->ex_path);
+	nfsd4_fslocs_free(&exp->ex_fslocs);
 	kfree(exp);
 }
 
@@ -370,7 +383,7 @@ static int check_export(struct inode *inode, int flags)
 	 */
 	if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
 	    !(flags & NFSEXP_FSID)) {
-		dprintk("exp_export: export of non-dev fs without fsid");
+		dprintk("exp_export: export of non-dev fs without fsid\n");
 		return -EINVAL;
 	}
 	if (!inode->i_sb->s_export_op) {
@@ -386,6 +399,69 @@ static int check_export(struct inode *inode, int flags)
 
 }
 
+#ifdef CONFIG_NFSD_V4
+
+static int
+fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc)
+{
+	int len;
+	int migrated, i, err;
+
+	len = qword_get(mesg, buf, PAGE_SIZE);
+	if (len != 5 || memcmp(buf, "fsloc", 5))
+		return 0;
+
+	/* listsize */
+	err = get_int(mesg, &fsloc->locations_count);
+	if (err)
+		return err;
+	if (fsloc->locations_count > MAX_FS_LOCATIONS)
+		return -EINVAL;
+	if (fsloc->locations_count == 0)
+		return 0;
+
+	fsloc->locations = kzalloc(fsloc->locations_count
+			* sizeof(struct nfsd4_fs_location), GFP_KERNEL);
+	if (!fsloc->locations)
+		return -ENOMEM;
+	for (i=0; i < fsloc->locations_count; i++) {
+		/* colon separated host list */
+		err = -EINVAL;
+		len = qword_get(mesg, buf, PAGE_SIZE);
+		if (len <= 0)
+			goto out_free_all;
+		err = -ENOMEM;
+		fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL);
+		if (!fsloc->locations[i].hosts)
+			goto out_free_all;
+		err = -EINVAL;
+		/* slash separated path component list */
+		len = qword_get(mesg, buf, PAGE_SIZE);
+		if (len <= 0)
+			goto out_free_all;
+		err = -ENOMEM;
+		fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL);
+		if (!fsloc->locations[i].path)
+			goto out_free_all;
+	}
+	/* migrated */
+	err = get_int(mesg, &migrated);
+	if (err)
+		goto out_free_all;
+	err = -EINVAL;
+	if (migrated < 0 || migrated > 1)
+		goto out_free_all;
+	fsloc->migrated = migrated;
+	return 0;
+out_free_all:
+	nfsd4_fslocs_free(fsloc);
+	return err;
+}
+
+#else /* CONFIG_NFSD_V4 */
+static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; }
+#endif
+
 static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 {
 	/* client path expiry [flags anonuid anongid fsid] */
@@ -398,6 +474,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	int an_int;
 
 	nd.dentry = NULL;
+	exp.ex_path = NULL;
 
 	if (mesg[mlen-1] != '\n')
 		return -EINVAL;
@@ -428,6 +505,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	exp.ex_client = dom;
 	exp.ex_mnt = nd.mnt;
 	exp.ex_dentry = nd.dentry;
+	exp.ex_path = kstrdup(buf, GFP_KERNEL);
+	err = -ENOMEM;
+	if (!exp.ex_path)
+		goto out;
 
 	/* expiry */
 	err = -EINVAL;
@@ -435,6 +516,11 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	if (exp.h.expiry_time == 0)
 		goto out;
 
+	/* fs locations */
+	exp.ex_fslocs.locations = NULL;
+	exp.ex_fslocs.locations_count = 0;
+	exp.ex_fslocs.migrated = 0;
+
 	/* flags */
 	err = get_int(&mesg, &an_int);
 	if (err == -ENOENT)
@@ -460,6 +546,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 
 		err = check_export(nd.dentry->d_inode, exp.ex_flags);
 		if (err) goto out;
+
+		err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
+		if (err)
+			goto out;
 	}
 
 	expp = svc_export_lookup(&exp);
@@ -473,6 +563,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	else
 		exp_put(expp);
  out:
+ 	kfree(exp.ex_path);
 	if (nd.dentry)
 		path_release(&nd);
  out_no_path:
@@ -482,7 +573,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
 	return err;
 }
 
-static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong);
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs);
 
 static int svc_export_show(struct seq_file *m,
 			   struct cache_detail *cd,
@@ -501,8 +593,8 @@ static int svc_export_show(struct seq_file *m,
 	seq_putc(m, '(');
 	if (test_bit(CACHE_VALID, &h->flags) && 
 	    !test_bit(CACHE_NEGATIVE, &h->flags))
-		exp_flags(m, exp->ex_flags, exp->ex_fsid, 
-			  exp->ex_anon_uid, exp->ex_anon_gid);
+		exp_flags(m, exp->ex_flags, exp->ex_fsid,
+			  exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
 	seq_puts(m, ")\n");
 	return 0;
 }
@@ -524,6 +616,10 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_client = item->ex_client;
 	new->ex_dentry = dget(item->ex_dentry);
 	new->ex_mnt = mntget(item->ex_mnt);
+	new->ex_path = NULL;
+	new->ex_fslocs.locations = NULL;
+	new->ex_fslocs.locations_count = 0;
+	new->ex_fslocs.migrated = 0;
 }
 
 static void export_update(struct cache_head *cnew, struct cache_head *citem)
@@ -535,6 +631,14 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
 	new->ex_anon_uid = item->ex_anon_uid;
 	new->ex_anon_gid = item->ex_anon_gid;
 	new->ex_fsid = item->ex_fsid;
+	new->ex_path = item->ex_path;
+	item->ex_path = NULL;
+	new->ex_fslocs.locations = item->ex_fslocs.locations;
+	item->ex_fslocs.locations = NULL;
+	new->ex_fslocs.locations_count = item->ex_fslocs.locations_count;
+	item->ex_fslocs.locations_count = 0;
+	new->ex_fslocs.migrated = item->ex_fslocs.migrated;
+	item->ex_fslocs.migrated = 0;
 }
 
 static struct cache_head *svc_export_alloc(void)
@@ -1044,40 +1148,32 @@ exp_find(struct auth_domain *clp, int fsid_type, u32 *fsidv,
  * for a given NFSv4 client.   The root is defined to be the
  * export point with fsid==0
  */
-int
+__be32
 exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp,
 	       struct cache_req *creq)
 {
-	struct svc_expkey *fsid_key;
 	struct svc_export *exp;
-	int rv;
+	__be32 rv;
 	u32 fsidv[2];
 
 	mk_fsid_v1(fsidv, 0);
 
-	fsid_key = exp_find_key(clp, 1, fsidv, creq);
-	if (IS_ERR(fsid_key) && PTR_ERR(fsid_key) == -EAGAIN)
+	exp = exp_find(clp, 1, fsidv, creq);
+	if (IS_ERR(exp) && PTR_ERR(exp) == -EAGAIN)
 		return nfserr_dropit;
-	if (!fsid_key || IS_ERR(fsid_key))
-		return nfserr_perm;
-
-	exp = exp_get_by_name(clp, fsid_key->ek_mnt, fsid_key->ek_dentry, creq);
 	if (exp == NULL)
-		rv = nfserr_perm;
+		return nfserr_perm;
 	else if (IS_ERR(exp))
-		rv = nfserrno(PTR_ERR(exp));
-	else {
-		rv = fh_compose(fhp, exp,
-				fsid_key->ek_dentry, NULL);
-		exp_put(exp);
-	}
-	cache_put(&fsid_key->h, &svc_expkey_cache);
+		return nfserrno(PTR_ERR(exp));
+	rv = fh_compose(fhp, exp, exp->ex_dentry, NULL);
+	exp_put(exp);
 	return rv;
 }
 
 /* Iterator */
 
 static void *e_start(struct seq_file *m, loff_t *pos)
+	__acquires(svc_export_cache.hash_lock)
 {
 	loff_t n = *pos;
 	unsigned hash, export;
@@ -1086,7 +1182,7 @@ static void *e_start(struct seq_file *m, loff_t *pos)
 	exp_readlock();
 	read_lock(&svc_export_cache.hash_lock);
 	if (!n--)
-		return (void *)1;
+		return SEQ_START_TOKEN;
 	hash = n >> 32;
 	export = n & ((1LL<<32) - 1);
 
@@ -1110,7 +1206,7 @@ static void *e_next(struct seq_file *m, void *p, loff_t *pos)
 	struct cache_head *ch = p;
 	int hash = (*pos >> 32);
 
-	if (p == (void *)1)
+	if (p == SEQ_START_TOKEN)
 		hash = 0;
 	else if (ch->next == NULL) {
 		hash++;
@@ -1131,6 +1227,7 @@ static void *e_next(struct seq_file *m, void *p, loff_t *pos)
 }
 
 static void e_stop(struct seq_file *m, void *p)
+	__releases(svc_export_cache.hash_lock)
 {
 	read_unlock(&svc_export_cache.hash_lock);
 	exp_readunlock();
@@ -1156,7 +1253,8 @@ static struct flags {
 	{ 0, {"", ""}}
 };
 
-static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong)
+static void exp_flags(struct seq_file *m, int flag, int fsid,
+		uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc)
 {
 	int first = 0;
 	struct flags *flg;
@@ -1172,21 +1270,34 @@ static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t
 		seq_printf(m, "%sanonuid=%d", first++?",":"", anonu);
 	if (anong != (gid_t)-2 && anong != (0x10000-2))
 		seq_printf(m, "%sanongid=%d", first++?",":"", anong);
+	if (fsloc && fsloc->locations_count > 0) {
+		char *loctype = (fsloc->migrated) ? "refer" : "replicas";
+		int i;
+
+		seq_printf(m, "%s%s=", first++?",":"", loctype);
+		seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\");
+		seq_putc(m, '@');
+		seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\");
+		for (i = 1; i < fsloc->locations_count; i++) {
+			seq_putc(m, ';');
+			seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\");
+			seq_putc(m, '@');
+			seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\");
+		}
+	}
 }
 
 static int e_show(struct seq_file *m, void *p)
 {
 	struct cache_head *cp = p;
 	struct svc_export *exp = container_of(cp, struct svc_export, h);
-	svc_client *clp;
 
-	if (p == (void *)1) {
+	if (p == SEQ_START_TOKEN) {
 		seq_puts(m, "# Version 1.1\n");
 		seq_puts(m, "# Path Client(Flags) # IPs\n");
 		return 0;
 	}
 
-	clp = exp->ex_client;
 	cache_get(&exp->h);
 	if (cache_check(&svc_export_cache, &exp->h, NULL))
 		return 0;
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 7b889ff15ae6..11fdaf7721b4 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -25,7 +25,7 @@
 static u32
 nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
 {
-	u32		nfserr;
+	__be32		nfserr;
 	struct svc_fh	fh;
 
 	/* must initialize before using! but maxsize doesn't matter */
@@ -39,18 +39,20 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
 	fh_put(&fh);
 	rqstp->rq_client = NULL;
 	exp_readunlock();
- 	/* nlm and nfsd don't share error codes.
-	 * we invent: 0 = no error
-	 *            1 = stale file handle
-	 *	      2 = other error
+ 	/* We return nlm error codes as nlm doesn't know
+	 * about nfsd, but nfsd does know about nlm..
 	 */
 	switch (nfserr) {
 	case nfs_ok:
 		return 0;
+	case nfserr_dropit:
+		return nlm_drop_reply;
+#ifdef CONFIG_LOCKD_V4
 	case nfserr_stale:
-		return 1;
+		return nlm4_stale_fh;
+#endif
 	default:
-		return 2;
+		return nlm_lck_denied;
 	}
 }
 
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index fc95c4df6693..e3eca0816986 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nfsd/nfsacl.c
+ * linux/fs/nfsd/nfs2acl.c
  *
  * Process version 2 NFSACL requests.
  *
@@ -21,7 +21,7 @@
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -30,12 +30,12 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * Get the Access and/or Default ACL of a file.
  */
-static int nfsacld_proc_getacl(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
 	svc_fh *fh;
 	struct posix_acl *acl;
-	int nfserr = 0;
+	__be32 nfserr = 0;
 
 	dprintk("nfsd: GETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
 
@@ -97,12 +97,12 @@ fail:
 /*
  * Set the Access and/or Default ACL of a file.
  */
-static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd_attrstat *resp)
 {
 	svc_fh *fh;
-	int nfserr = 0;
+	__be32 nfserr = 0;
 
 	dprintk("nfsd: SETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
 
@@ -128,7 +128,7 @@ static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
 /*
  * Check file attributes
  */
-static int nfsacld_proc_getattr(struct svc_rqst * rqstp,
+static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
 		struct nfsd_fhandle *argp, struct nfsd_attrstat *resp)
 {
 	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
@@ -140,10 +140,10 @@ static int nfsacld_proc_getattr(struct svc_rqst * rqstp,
 /*
  * Check file access
  */
-static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
+static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
 		struct nfsd3_accessres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: ACCESS(2acl)   %s 0x%x\n",
 			SVCFH_fmt(&argp->fh),
@@ -158,7 +158,7 @@ static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *
 /*
  * XDR decode functions
  */
-static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclargs *argp)
 {
 	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -169,7 +169,7 @@ static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 
-static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_setaclargs *argp)
 {
 	struct kvec *head = rqstp->rq_arg.head;
@@ -194,7 +194,7 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
 	return (n > 0);
 }
 
-static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_fhandle *argp)
 {
 	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -202,7 +202,7 @@ static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p,
 	return xdr_argsize_check(rqstp, p);
 }
 
-static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_accessargs *argp)
 {
 	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
@@ -217,7 +217,7 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
  */
 
 /* GETACL */
-static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	struct dentry *dentry = resp->fh.fh_dentry;
@@ -241,7 +241,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
 
 	rqstp->rq_res.page_len = w;
 	while (w > 0) {
-		if (!svc_take_res_page(rqstp))
+		if (!rqstp->rq_respages[rqstp->rq_resused++])
 			return 0;
 		w -= PAGE_SIZE;
 	}
@@ -259,7 +259,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
 	return 1;
 }
 
-static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_attrstat *resp)
 {
 	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
@@ -267,7 +267,7 @@ static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p,
 }
 
 /* ACCESS */
-static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_accessres *resp)
 {
 	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
@@ -278,7 +278,7 @@ static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
 /*
  * XDR release functions
  */
-static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	fh_put(&resp->fh);
@@ -287,7 +287,7 @@ static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p,
 	return 1;
 }
 
-static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd_fhandle *resp)
 {
 	fh_put(&resp->fh);
@@ -333,4 +333,5 @@ struct svc_version	nfsd_acl_version2 = {
 		.vs_proc	= nfsd_acl_procedures2,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+		.vs_hidden	= 1,
 };
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 16e10c170aed..fcad2895ddb0 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -19,7 +19,7 @@
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -28,12 +28,12 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * Get the Access and/or Default ACL of a file.
  */
-static int nfsd3_proc_getacl(struct svc_rqst * rqstp,
+static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
 		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
 {
 	svc_fh *fh;
 	struct posix_acl *acl;
-	int nfserr = 0;
+	__be32 nfserr = 0;
 
 	fh = fh_copy(&resp->fh, &argp->fh);
 	if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
@@ -93,12 +93,12 @@ fail:
 /*
  * Set the Access and/or Default ACL of a file.
  */
-static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
+static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
 		struct nfsd3_setaclargs *argp,
 		struct nfsd3_attrstat *resp)
 {
 	svc_fh *fh;
-	int nfserr = 0;
+	__be32 nfserr = 0;
 
 	fh = fh_copy(&resp->fh, &argp->fh);
 	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
@@ -122,7 +122,7 @@ static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
 /*
  * XDR decode functions
  */
-static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclargs *args)
 {
 	if (!(p = nfs3svc_decode_fh(p, &args->fh)))
@@ -133,7 +133,7 @@ static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 
-static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_setaclargs *args)
 {
 	struct kvec *head = rqstp->rq_arg.head;
@@ -163,7 +163,7 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
  */
 
 /* GETACL */
-static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	struct dentry *dentry = resp->fh.fh_dentry;
@@ -185,7 +185,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
 
 		rqstp->rq_res.page_len = w;
 		while (w > 0) {
-			if (!svc_take_res_page(rqstp))
+			if (!rqstp->rq_respages[rqstp->rq_resused++])
 				return 0;
 			w -= PAGE_SIZE;
 		}
@@ -208,7 +208,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
 }
 
 /* SETACL */
-static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_attrstat *resp)
 {
 	p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
@@ -219,7 +219,7 @@ static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p,
 /*
  * XDR release functions
  */
-static int nfs3svc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+static int nfs3svc_release_getacl(struct svc_rqst *rqstp, __be32 *p,
 		struct nfsd3_getaclres *resp)
 {
 	fh_put(&resp->fh);
@@ -263,5 +263,6 @@ struct svc_version	nfsd_acl_version3 = {
 		.vs_proc	= nfsd_acl_procedures3,
 		.vs_dispatch	= nfsd_dispatch,
 		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+		.vs_hidden	= 1,
 };
 
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index f61142afea44..64db601c2bd2 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -43,7 +43,7 @@ static int	nfs3_ftypes[] = {
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -52,11 +52,12 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 /*
  * Get a file's attributes
  */
-static int
+static __be32
 nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 					   struct nfsd3_attrstat *resp)
 {
-	int	err, nfserr;
+	int	err;
+	__be32	nfserr;
 
 	dprintk("nfsd: GETATTR(3)  %s\n",
 		SVCFH_fmt(&argp->fh));
@@ -76,11 +77,11 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 /*
  * Set a file's attributes
  */
-static int
+static __be32
 nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp,
 					   struct nfsd3_attrstat  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: SETATTR(3)  %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -94,11 +95,11 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp, struct nfsd3_sattrargs *argp,
 /*
  * Look up a path name component
  */
-static int
+static __be32
 nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 					  struct nfsd3_diropres  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: LOOKUP(3)   %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -118,11 +119,11 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 /*
  * Check file access
  */
-static int
+static __be32
 nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
 					  struct nfsd3_accessres *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: ACCESS(3)   %s 0x%x\n",
 				SVCFH_fmt(&argp->fh),
@@ -137,11 +138,11 @@ nfsd3_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
 /*
  * Read a symlink.
  */
-static int
+static __be32
 nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp,
 					   struct nfsd3_readlinkres *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 
 	dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
 
@@ -155,11 +156,12 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd3_readlinkargs *argp,
 /*
  * Read a portion of a file.
  */
-static int
+static __be32
 nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 				        struct nfsd3_readres  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
+	u32	max_blocksize = svc_max_payload(rqstp);
 
 	dprintk("nfsd: READ(3) %s %lu bytes at %lu\n",
 				SVCFH_fmt(&argp->fh),
@@ -172,15 +174,15 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 	 */
 
 	resp->count = argp->count;
-	if (NFSSVC_MAXBLKSIZE < resp->count)
-		resp->count = NFSSVC_MAXBLKSIZE;
+	if (max_blocksize < resp->count)
+		resp->count = max_blocksize;
 
 	svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
 
 	fh_copy(&resp->fh, &argp->fh);
 	nfserr = nfsd_read(rqstp, &resp->fh, NULL,
 				  argp->offset,
-			   	  argp->vec, argp->vlen,
+			   	  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
 	if (nfserr == 0) {
 		struct inode	*inode = resp->fh.fh_dentry->d_inode;
@@ -194,11 +196,11 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
 /*
  * Write data to a file
  */
-static int
+static __be32
 nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 					 struct nfsd3_writeres  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: WRITE(3)    %s %d bytes at %ld%s\n",
 				SVCFH_fmt(&argp->fh),
@@ -210,7 +212,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 	resp->committed = argp->stable;
 	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
 				   argp->offset,
-				   argp->vec, argp->vlen,
+				   rqstp->rq_vec, argp->vlen,
 				   argp->len,
 				   &resp->committed);
 	resp->count = argp->count;
@@ -222,13 +224,13 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
  * At least in theory; we'll see how it fares in practice when the
  * first reports about SunOS compatibility problems start to pour in...
  */
-static int
+static __be32
 nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 					  struct nfsd3_diropres   *resp)
 {
 	svc_fh		*dirfhp, *newfhp = NULL;
 	struct iattr	*attr;
-	u32		nfserr;
+	__be32		nfserr;
 
 	dprintk("nfsd: CREATE(3)   %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -264,11 +266,11 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 /*
  * Make directory. This operation is not idempotent.
  */
-static int
+static __be32
 nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 					 struct nfsd3_diropres   *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: MKDIR(3)    %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -284,11 +286,11 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
 	RETURN_STATUS(nfserr);
 }
 
-static int
+static __be32
 nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
 					   struct nfsd3_diropres    *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: SYMLINK(3)  %s %.*s -> %.*s\n",
 				SVCFH_fmt(&argp->ffh),
@@ -306,11 +308,12 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
 /*
  * Make socket/fifo/device.
  */
-static int
+static __be32
 nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
 					 struct nfsd3_diropres  *resp)
 {
-	int	nfserr, type;
+	__be32	nfserr;
+	int type;
 	dev_t	rdev = 0;
 
 	dprintk("nfsd: MKNOD(3)    %s %.*s\n",
@@ -342,11 +345,11 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
 /*
  * Remove file/fifo/socket etc.
  */
-static int
+static __be32
 nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 					  struct nfsd3_attrstat  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: REMOVE(3)   %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -362,11 +365,11 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 /*
  * Remove a directory
  */
-static int
+static __be32
 nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 					 struct nfsd3_attrstat  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: RMDIR(3)    %s %.*s\n",
 				SVCFH_fmt(&argp->fh),
@@ -378,11 +381,11 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
 	RETURN_STATUS(nfserr);
 }
 
-static int
+static __be32
 nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp,
 					  struct nfsd3_renameres  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: RENAME(3)   %s %.*s ->\n",
 				SVCFH_fmt(&argp->ffh),
@@ -400,11 +403,11 @@ nfsd3_proc_rename(struct svc_rqst *rqstp, struct nfsd3_renameargs *argp,
 	RETURN_STATUS(nfserr);
 }
 
-static int
+static __be32
 nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp,
 					struct nfsd3_linkres  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: LINK(3)     %s ->\n",
 				SVCFH_fmt(&argp->ffh));
@@ -423,11 +426,12 @@ nfsd3_proc_link(struct svc_rqst *rqstp, struct nfsd3_linkargs *argp,
 /*
  * Read a portion of a directory.
  */
-static int
+static __be32
 nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 					   struct nfsd3_readdirres  *resp)
 {
-	int		nfserr, count;
+	__be32		nfserr;
+	int		count;
 
 	dprintk("nfsd: READDIR(3)  %s %d bytes at %d\n",
 				SVCFH_fmt(&argp->fh),
@@ -458,11 +462,12 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
  * Read a portion of a directory, including file handles and attrs.
  * For now, we choose to ignore the dircount parameter.
  */
-static int
+static __be32
 nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 					       struct nfsd3_readdirres  *resp)
 {
-	int	nfserr, count = 0;
+	__be32	nfserr;
+	int	count = 0;
 	loff_t	offset;
 	int	i;
 	caddr_t	page_addr = NULL;
@@ -516,11 +521,11 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
 /*
  * Get file system stats
  */
-static int
+static __be32
 nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 					   struct nfsd3_fsstatres *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: FSSTAT(3)   %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -533,20 +538,21 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 /*
  * Get file system info
  */
-static int
+static __be32
 nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 					   struct nfsd3_fsinfores *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
+	u32	max_blocksize = svc_max_payload(rqstp);
 
 	dprintk("nfsd: FSINFO(3)   %s\n",
 				SVCFH_fmt(&argp->fh));
 
-	resp->f_rtmax  = NFSSVC_MAXBLKSIZE;
-	resp->f_rtpref = NFSSVC_MAXBLKSIZE;
+	resp->f_rtmax  = max_blocksize;
+	resp->f_rtpref = max_blocksize;
 	resp->f_rtmult = PAGE_SIZE;
-	resp->f_wtmax  = NFSSVC_MAXBLKSIZE;
-	resp->f_wtpref = NFSSVC_MAXBLKSIZE;
+	resp->f_wtmax  = max_blocksize;
+	resp->f_wtpref = max_blocksize;
 	resp->f_wtmult = PAGE_SIZE;
 	resp->f_dtpref = PAGE_SIZE;
 	resp->f_maxfilesize = ~(u32) 0;
@@ -574,11 +580,11 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 /*
  * Get pathconf info for the specified file
  */
-static int
+static __be32
 nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle      *argp,
 					     struct nfsd3_pathconfres *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: PATHCONF(3) %s\n",
 				SVCFH_fmt(&argp->fh));
@@ -617,11 +623,11 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle      *argp,
 /*
  * Commit a file (range) to stable storage.
  */
-static int
+static __be32
 nfsd3_proc_commit(struct svc_rqst * rqstp, struct nfsd3_commitargs *argp,
 					   struct nfsd3_commitres  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: COMMIT(3)   %s %u@%Lu\n",
 				SVCFH_fmt(&argp->fh),
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 243d94b9653a..b4baca3053c3 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -42,23 +42,23 @@ static u32	nfs3_ftypes[] = {
 /*
  * XDR functions for basic NFS types
  */
-static inline u32 *
-encode_time3(u32 *p, struct timespec *time)
+static inline __be32 *
+encode_time3(__be32 *p, struct timespec *time)
 {
 	*p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec);
 	return p;
 }
 
-static inline u32 *
-decode_time3(u32 *p, struct timespec *time)
+static inline __be32 *
+decode_time3(__be32 *p, struct timespec *time)
 {
 	time->tv_sec = ntohl(*p++);
 	time->tv_nsec = ntohl(*p++);
 	return p;
 }
 
-static inline u32 *
-decode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	unsigned int size;
 	fh_init(fhp, NFS3_FHSIZE);
@@ -72,13 +72,13 @@ decode_fh(u32 *p, struct svc_fh *fhp)
 }
 
 /* Helper function for NFSv3 ACL code */
-u32 *nfs3svc_decode_fh(u32 *p, struct svc_fh *fhp)
+__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	return decode_fh(p, fhp);
 }
 
-static inline u32 *
-encode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	unsigned int size = fhp->fh_handle.fh_size;
 	*p++ = htonl(size);
@@ -91,8 +91,8 @@ encode_fh(u32 *p, struct svc_fh *fhp)
  * Decode a file name and make sure that the path contains
  * no slashes or null bytes.
  */
-static inline u32 *
-decode_filename(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_filename(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
 	int		i;
@@ -107,8 +107,8 @@ decode_filename(u32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline u32 *
-decode_sattr3(u32 *p, struct iattr *iap)
+static inline __be32 *
+decode_sattr3(__be32 *p, struct iattr *iap)
 {
 	u32	tmp;
 
@@ -153,8 +153,8 @@ decode_sattr3(u32 *p, struct iattr *iap)
 	return p;
 }
 
-static inline u32 *
-encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+static inline __be32 *
+encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	      struct kstat *stat)
 {
 	struct dentry	*dentry = fhp->fh_dentry;
@@ -186,8 +186,8 @@ encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
 	return p;
 }
 
-static inline u32 *
-encode_saved_post_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct inode	*inode = fhp->fh_dentry->d_inode;
 
@@ -224,8 +224,8 @@ encode_saved_post_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
  * The inode may be NULL if the call failed because of a stale file
  * handle. In this case, no attributes are returned.
  */
-static u32 *
-encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static __be32 *
+encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct dentry *dentry = fhp->fh_dentry;
 	if (dentry && dentry->d_inode != NULL) {
@@ -243,8 +243,8 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 }
 
 /* Helper for NFSv3 ACLs */
-u32 *
-nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+__be32 *
+nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	return encode_post_op_attr(rqstp, p, fhp);
 }
@@ -252,8 +252,8 @@ nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 /*
  * Enocde weak cache consistency data
  */
-static u32 *
-encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+static __be32 *
+encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct dentry	*dentry = fhp->fh_dentry;
 
@@ -278,7 +278,7 @@ encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
  * XDR decode functions
  */
 int
-nfs3svc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
+nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
@@ -286,7 +286,7 @@ nfs3svc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args
 }
 
 int
-nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_sattrargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -303,7 +303,7 @@ nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_diropargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -314,7 +314,7 @@ nfs3svc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_accessargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -325,11 +325,12 @@ nfs3svc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readargs *args)
 {
 	unsigned int len;
 	int v,pn;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	if (!(p = decode_fh(p, &args->fh))
 	 || !(p = xdr_decode_hyper(p, &args->offset)))
@@ -337,17 +338,16 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 
 	len = args->count = ntohl(*p++);
 
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > max_blocksize)
+		len = max_blocksize;
 
 	/* set up the kvec */
 	v=0;
 	while (len > 0) {
-		pn = rqstp->rq_resused;
-		svc_take_page(rqstp);
-		args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
-		args->vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
-		len -= args->vec[v].iov_len;
+		pn = rqstp->rq_resused++;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+		rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
 	}
 	args->vlen = v;
@@ -355,10 +355,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_writeargs *args)
 {
 	unsigned int len, v, hdr;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	if (!(p = decode_fh(p, &args->fh))
 	 || !(p = xdr_decode_hyper(p, &args->offset)))
@@ -373,26 +374,26 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
 	    rqstp->rq_arg.len - hdr < len)
 		return 0;
 
-	args->vec[0].iov_base = (void*)p;
-	args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+	rqstp->rq_vec[0].iov_base = (void*)p;
+	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
 
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > max_blocksize)
+		len = max_blocksize;
 	v=  0;
-	while (len > args->vec[v].iov_len) {
-		len -= args->vec[v].iov_len;
+	while (len > rqstp->rq_vec[v].iov_len) {
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
-		args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]);
-		args->vec[v].iov_len = PAGE_SIZE;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
 	}
-	args->vec[v].iov_len = len;
+	rqstp->rq_vec[v].iov_len = len;
 	args->vlen = v+1;
 
-	return args->count == args->len && args->vec[0].iov_len > 0;
+	return args->count == args->len && rqstp->rq_vec[0].iov_len > 0;
 }
 
 int
-nfs3svc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_createargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -416,7 +417,7 @@ nfs3svc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
 	return xdr_argsize_check(rqstp, p);
 }
 int
-nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_createargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -428,7 +429,7 @@ nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_symlinkargs *args)
 {
 	unsigned int len;
@@ -446,11 +447,11 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
 	 * This page appears in the rq_res.pages list, but as pages_len is always
 	 * 0, it won't get in the way
 	 */
-	svc_take_page(rqstp);
 	len = ntohl(*p++);
 	if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
 		return 0;
-	args->tname = new = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->tname = new =
+		page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 	args->tlen = len;
 	/* first copy and check from the first page */
 	old = (char*)p;
@@ -480,7 +481,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_mknodargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -504,7 +505,7 @@ nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_renameargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -517,19 +518,19 @@ nfs3svc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readlinkargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer =
+		page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfs3svc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_linkargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -541,7 +542,7 @@ nfs3svc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readdirargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -554,17 +555,18 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
 	if (args->count > PAGE_SIZE)
 		args->count = PAGE_SIZE;
 
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer =
+		page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readdirargs *args)
 {
 	int len, pn;
+	u32 max_blocksize = svc_max_payload(rqstp);
 
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
@@ -573,13 +575,12 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
 	args->dircount = ntohl(*p++);
 	args->count    = ntohl(*p++);
 
-	len = (args->count > NFSSVC_MAXBLKSIZE) ? NFSSVC_MAXBLKSIZE :
+	len = (args->count > max_blocksize) ? max_blocksize :
 						  args->count;
 	args->count = len;
 
 	while (len > 0) {
-		pn = rqstp->rq_resused;
-		svc_take_page(rqstp);
+		pn = rqstp->rq_resused++;
 		if (!args->buffer)
 			args->buffer = page_address(rqstp->rq_respages[pn]);
 		len -= PAGE_SIZE;
@@ -589,7 +590,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_decode_commitargs(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_commitargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -608,14 +609,14 @@ nfs3svc_decode_commitargs(struct svc_rqst *rqstp, u32 *p,
  * will work properly.
  */
 int
-nfs3svc_encode_voidres(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
 
 /* GETATTR */
 int
-nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	if (resp->status == 0)
@@ -625,7 +626,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 
 /* SETATTR, REMOVE, RMDIR */
 int
-nfs3svc_encode_wccstat(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -634,7 +635,7 @@ nfs3svc_encode_wccstat(struct svc_rqst *rqstp, u32 *p,
 
 /* LOOKUP */
 int
-nfs3svc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_diropres *resp)
 {
 	if (resp->status == 0) {
@@ -647,7 +648,7 @@ nfs3svc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 
 /* ACCESS */
 int
-nfs3svc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_accessres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -658,7 +659,7 @@ nfs3svc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
 
 /* READLINK */
 int
-nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readlinkres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -668,7 +669,6 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 		rqstp->rq_res.page_len = resp->len;
 		if (resp->len & 3) {
 			/* need to pad the tail */
-			rqstp->rq_restailpage = 0;
 			rqstp->rq_res.tail[0].iov_base = p;
 			*p = 0;
 			rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
@@ -680,7 +680,7 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 
 /* READ */
 int
-nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -693,7 +693,6 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 		rqstp->rq_res.page_len = resp->count;
 		if (resp->count & 3) {
 			/* need to pad the tail */
-			rqstp->rq_restailpage = 0;
 			rqstp->rq_res.tail[0].iov_base = p;
 			*p = 0;
 			rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3);
@@ -705,7 +704,7 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 
 /* WRITE */
 int
-nfs3svc_encode_writeres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_writeres *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -720,7 +719,7 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, u32 *p,
 
 /* CREATE, MKDIR, SYMLINK, MKNOD */
 int
-nfs3svc_encode_createres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_diropres *resp)
 {
 	if (resp->status == 0) {
@@ -734,7 +733,7 @@ nfs3svc_encode_createres(struct svc_rqst *rqstp, u32 *p,
 
 /* RENAME */
 int
-nfs3svc_encode_renameres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_renameres *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->ffh);
@@ -744,7 +743,7 @@ nfs3svc_encode_renameres(struct svc_rqst *rqstp, u32 *p,
 
 /* LINK */
 int
-nfs3svc_encode_linkres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_linkres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -754,7 +753,7 @@ nfs3svc_encode_linkres(struct svc_rqst *rqstp, u32 *p,
 
 /* READDIR */
 int
-nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_readdirres *resp)
 {
 	p = encode_post_op_attr(rqstp, p, &resp->fh);
@@ -768,7 +767,6 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
 		rqstp->rq_res.page_len = (resp->count) << 2;
 
 		/* add the 'tail' to the end of the 'head' page - page 0. */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p++ = 0;		/* no more entries */
 		*p++ = htonl(resp->common.err == nfserr_eof);
@@ -778,8 +776,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
 		return xdr_ressize_check(rqstp, p);
 }
 
-static inline u32 *
-encode_entry_baggage(struct nfsd3_readdirres *cd, u32 *p, const char *name,
+static inline __be32 *
+encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
 	     int namlen, ino_t ino)
 {
 	*p++ = xdr_one;				 /* mark entry present */
@@ -792,8 +790,8 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, u32 *p, const char *name,
 	return p;
 }
 
-static inline u32 *
-encode_entryplus_baggage(struct nfsd3_readdirres *cd, u32 *p,
+static inline __be32 *
+encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
 		struct svc_fh *fhp)
 {
 		p = encode_post_op_attr(cd->rqstp, p, fhp);
@@ -855,7 +853,7 @@ encode_entry(struct readdir_cd *ccd, const char *name,
 {
 	struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres,
 		       					common);
-	u32		*p = cd->buffer;
+	__be32		*p = cd->buffer;
 	caddr_t		curr_page_addr = NULL;
 	int		pn;		/* current page number */
 	int		slen;		/* string (name) length */
@@ -921,7 +919,7 @@ encode_entry(struct readdir_cd *ccd, const char *name,
 	} else if (cd->rqstp->rq_respages[pn+1] != NULL) {
 		/* temporarily encode entry into next page, then move back to
 		 * current and next page in rq_respages[] */
-		u32 *p1, *tmp;
+		__be32 *p1, *tmp;
 		int len1, len2;
 
 		/* grab next page for temporary storage of entry */
@@ -1011,7 +1009,7 @@ nfs3svc_encode_entry_plus(struct readdir_cd *cd, const char *name,
 
 /* FSSTAT */
 int
-nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_fsstatres *resp)
 {
 	struct kstatfs	*s = &resp->stats;
@@ -1033,7 +1031,7 @@ nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, u32 *p,
 
 /* FSINFO */
 int
-nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_fsinfores *resp)
 {
 	*p++ = xdr_zero;	/* no post_op_attr */
@@ -1057,7 +1055,7 @@ nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, u32 *p,
 
 /* PATHCONF */
 int
-nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_pathconfres *resp)
 {
 	*p++ = xdr_zero;	/* no post_op_attr */
@@ -1076,7 +1074,7 @@ nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, u32 *p,
 
 /* COMMIT */
 int
-nfs3svc_encode_commitres(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_commitres *resp)
 {
 	p = encode_wcc_data(rqstp, p, &resp->fh);
@@ -1092,7 +1090,7 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, u32 *p,
  * XDR release functions
  */
 int
-nfs3svc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_attrstat *resp)
 {
 	fh_put(&resp->fh);
@@ -1100,7 +1098,7 @@ nfs3svc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfs3svc_release_fhandle2(struct svc_rqst *rqstp, u32 *p,
+nfs3svc_release_fhandle2(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd3_fhandle_pair *resp)
 {
 	fh_put(&resp->fh1);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index edb107e61b91..5d94555cdc83 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -63,6 +63,8 @@
 #define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \
 		| NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE)
 
+#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS | NFS4_ACE_IDENTIFIER_GROUP)
+
 #define MASK_EQUAL(mask1, mask2) \
 	( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) )
 
@@ -96,24 +98,26 @@ deny_mask(u32 allow_mask, unsigned int flags)
 /* XXX: modify functions to return NFS errors; they're only ever
  * used by nfs code, after all.... */
 
-static int
-mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
+/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the
+ * side of being more restrictive, so the mode bit mapping below is
+ * pessimistic.  An optimistic version would be needed to handle DENY's,
+ * but we espect to coalesce all ALLOWs and DENYs before mapping to mode
+ * bits. */
+
+static void
+low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags)
 {
-	u32 ignore = 0;
+	u32 write_mode = NFS4_WRITE_MODE;
 
-	if (!(flags & NFS4_ACL_DIR))
-		ignore |= NFS4_ACE_DELETE_CHILD; /* ignore it */
-	perm |= ignore;
+	if (flags & NFS4_ACL_DIR)
+		write_mode |= NFS4_ACE_DELETE_CHILD;
 	*mode = 0;
 	if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE)
 		*mode |= ACL_READ;
-	if ((perm & NFS4_WRITE_MODE) == NFS4_WRITE_MODE)
+	if ((perm & write_mode) == write_mode)
 		*mode |= ACL_WRITE;
 	if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE)
 		*mode |= ACL_EXECUTE;
-	if (!MASK_EQUAL(perm, ignore|mask_from_posix(*mode, flags)))
-		return -EINVAL;
-	return 0;
 }
 
 struct ace_container {
@@ -338,38 +342,6 @@ sort_pacl(struct posix_acl *pacl)
 	return;
 }
 
-static int
-write_pace(struct nfs4_ace *ace, struct posix_acl *pacl,
-		struct posix_acl_entry **pace, short tag, unsigned int flags)
-{
-	struct posix_acl_entry *this = *pace;
-
-	if (*pace == pacl->a_entries + pacl->a_count)
-		return -EINVAL; /* fell off the end */
-	(*pace)++;
-	this->e_tag = tag;
-	if (tag == ACL_USER_OBJ)
-		flags |= NFS4_ACL_OWNER;
-	if (mode_from_nfs4(ace->access_mask, &this->e_perm, flags))
-		return -EINVAL;
-	this->e_id = (tag == ACL_USER || tag == ACL_GROUP ?
-			ace->who : ACL_UNDEFINED_ID);
-	return 0;
-}
-
-static struct nfs4_ace *
-get_next_v4_ace(struct list_head **p, struct list_head *head)
-{
-	struct nfs4_ace *ace;
-
-	*p = (*p)->next;
-	if (*p == head)
-		return NULL;
-	ace = list_entry(*p, struct nfs4_ace, l_ace);
-
-	return ace;
-}
-
 int
 nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
 		struct posix_acl **dpacl, unsigned int flags)
@@ -385,42 +357,23 @@ nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
 		goto out;
 
 	error = nfs4_acl_split(acl, dacl);
-	if (error < 0)
+	if (error)
 		goto out_acl;
 
-	if (pacl != NULL) {
-		if (acl->naces == 0) {
-			error = -ENODATA;
-			goto try_dpacl;
-		}
-
-		*pacl = _nfsv4_to_posix_one(acl, flags);
-		if (IS_ERR(*pacl)) {
-			error = PTR_ERR(*pacl);
-			*pacl = NULL;
-			goto out_acl;
-		}
+	*pacl = _nfsv4_to_posix_one(acl, flags);
+	if (IS_ERR(*pacl)) {
+		error = PTR_ERR(*pacl);
+		*pacl = NULL;
+		goto out_acl;
 	}
 
-try_dpacl:
-	if (dpacl != NULL) {
-		if (dacl->naces == 0) {
-			if (pacl == NULL || *pacl == NULL)
-				error = -ENODATA;
-			goto out_acl;
-		}
-
-		error = 0;
-		*dpacl = _nfsv4_to_posix_one(dacl, flags);
-		if (IS_ERR(*dpacl)) {
-			error = PTR_ERR(*dpacl);
-			*dpacl = NULL;
-			goto out_acl;
-		}
+	*dpacl = _nfsv4_to_posix_one(dacl, flags);
+	if (IS_ERR(*dpacl)) {
+		error = PTR_ERR(*dpacl);
+		*dpacl = NULL;
 	}
-
 out_acl:
-	if (error && pacl) {
+	if (error) {
 		posix_acl_release(*pacl);
 		*pacl = NULL;
 	}
@@ -429,349 +382,311 @@ out:
 	return error;
 }
 
+/*
+ * While processing the NFSv4 ACE, this maintains bitmasks representing
+ * which permission bits have been allowed and which denied to a given
+ * entity: */
+struct posix_ace_state {
+	u32 allow;
+	u32 deny;
+};
+
+struct posix_user_ace_state {
+	uid_t uid;
+	struct posix_ace_state perms;
+};
+
+struct posix_ace_state_array {
+	int n;
+	struct posix_user_ace_state aces[];
+};
+
+/*
+ * While processing the NFSv4 ACE, this maintains the partial permissions
+ * calculated so far: */
+
+struct posix_acl_state {
+	struct posix_ace_state owner;
+	struct posix_ace_state group;
+	struct posix_ace_state other;
+	struct posix_ace_state everyone;
+	struct posix_ace_state mask; /* Deny unused in this case */
+	struct posix_ace_state_array *users;
+	struct posix_ace_state_array *groups;
+};
+
 static int
-same_who(struct nfs4_ace *a, struct nfs4_ace *b)
+init_state(struct posix_acl_state *state, int cnt)
 {
-	return a->whotype == b->whotype &&
-		(a->whotype != NFS4_ACL_WHO_NAMED || a->who == b->who);
+	int alloc;
+
+	memset(state, 0, sizeof(struct posix_acl_state));
+	/*
+	 * In the worst case, each individual acl could be for a distinct
+	 * named user or group, but we don't no which, so we allocate
+	 * enough space for either:
+	 */
+	alloc = sizeof(struct posix_ace_state_array)
+		+ cnt*sizeof(struct posix_ace_state);
+	state->users = kzalloc(alloc, GFP_KERNEL);
+	if (!state->users)
+		return -ENOMEM;
+	state->groups = kzalloc(alloc, GFP_KERNEL);
+	if (!state->groups) {
+		kfree(state->users);
+		return -ENOMEM;
+	}
+	return 0;
 }
 
-static int
-complementary_ace_pair(struct nfs4_ace *allow, struct nfs4_ace *deny,
-		unsigned int flags)
-{
-	int ignore = 0;
-	if (!(flags & NFS4_ACL_DIR))
-		ignore |= NFS4_ACE_DELETE_CHILD;
-	return MASK_EQUAL(ignore|deny_mask(allow->access_mask, flags),
-			  ignore|deny->access_mask) &&
-		allow->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
-		deny->type == NFS4_ACE_ACCESS_DENIED_ACE_TYPE &&
-		allow->flag == deny->flag &&
-		same_who(allow, deny);
+static void
+free_state(struct posix_acl_state *state) {
+	kfree(state->users);
+	kfree(state->groups);
 }
 
-static inline int
-user_obj_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
-		struct posix_acl *pacl, struct posix_acl_entry **pace,
-		unsigned int flags)
+static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate)
 {
-	int error = -EINVAL;
-	struct nfs4_ace *ace, *ace2;
-
-	ace = get_next_v4_ace(p, &n4acl->ace_head);
-	if (ace == NULL)
-		goto out;
-	if (ace2type(ace) != ACL_USER_OBJ)
-		goto out;
-	error = write_pace(ace, pacl, pace, ACL_USER_OBJ, flags);
-	if (error < 0)
-		goto out;
-	error = -EINVAL;
-	ace2 = get_next_v4_ace(p, &n4acl->ace_head);
-	if (ace2 == NULL)
-		goto out;
-	if (!complementary_ace_pair(ace, ace2, flags))
-		goto out;
-	error = 0;
-out:
-	return error;
+	state->mask.allow |= astate->allow;
 }
 
-static inline int
-users_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
-		struct nfs4_ace **mask_ace,
-		struct posix_acl *pacl, struct posix_acl_entry **pace,
-		unsigned int flags)
-{
-	int error = -EINVAL;
-	struct nfs4_ace *ace, *ace2;
+/*
+ * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS,
+ * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate
+ * to traditional read/write/execute permissions.
+ *
+ * It's problematic to reject acls that use certain mode bits, because it
+ * places the burden on users to learn the rules about which bits one
+ * particular server sets, without giving the user a lot of help--we return an
+ * error that could mean any number of different things.  To make matters
+ * worse, the problematic bits might be introduced by some application that's
+ * automatically mapping from some other acl model.
+ *
+ * So wherever possible we accept anything, possibly erring on the side of
+ * denying more permissions than necessary.
+ *
+ * However we do reject *explicit* DENY's of a few bits representing
+ * permissions we could never deny:
+ */
 
-	ace = get_next_v4_ace(p, &n4acl->ace_head);
-	if (ace == NULL)
-		goto out;
-	while (ace2type(ace) == ACL_USER) {
-		if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
-			goto out;
-		if (*mask_ace &&
-			!MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
-			goto out;
-		*mask_ace = ace;
-		ace = get_next_v4_ace(p, &n4acl->ace_head);
-		if (ace == NULL)
-			goto out;
-		if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
-			goto out;
-		error = write_pace(ace, pacl, pace, ACL_USER, flags);
-		if (error < 0)
-			goto out;
-		error = -EINVAL;
-		ace2 = get_next_v4_ace(p, &n4acl->ace_head);
-		if (ace2 == NULL)
-			goto out;
-		if (!complementary_ace_pair(ace, ace2, flags))
-			goto out;
-		if ((*mask_ace)->flag != ace2->flag ||
-				!same_who(*mask_ace, ace2))
-			goto out;
-		ace = get_next_v4_ace(p, &n4acl->ace_head);
-		if (ace == NULL)
-			goto out;
-	}
-	error = 0;
-out:
-	return error;
+static inline int check_deny(u32 mask, int isowner)
+{
+	if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL))
+		return -EINVAL;
+	if (!isowner)
+		return 0;
+	if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL))
+		return -EINVAL;
+	return 0;
 }
 
-static inline int
-group_obj_and_groups_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
-		struct nfs4_ace **mask_ace,
-		struct posix_acl *pacl, struct posix_acl_entry **pace,
-		unsigned int flags)
+static struct posix_acl *
+posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
 {
-	int error = -EINVAL;
-	struct nfs4_ace *ace, *ace2;
-	struct ace_container *ac;
-	struct list_head group_l;
-
-	INIT_LIST_HEAD(&group_l);
-	ace = list_entry(*p, struct nfs4_ace, l_ace);
-
-	/* group owner (mask and allow aces) */
+	struct posix_acl_entry *pace;
+	struct posix_acl *pacl;
+	int nace;
+	int i, error = 0;
 
-	if (pacl->a_count != 3) {
-		/* then the group owner should be preceded by mask */
-		if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
-			goto out;
-		if (*mask_ace &&
-			!MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
-			goto out;
-		*mask_ace = ace;
-		ace = get_next_v4_ace(p, &n4acl->ace_head);
-		if (ace == NULL)
-			goto out;
+	nace = 4 + state->users->n + state->groups->n;
+	pacl = posix_acl_alloc(nace, GFP_KERNEL);
+	if (!pacl)
+		return ERR_PTR(-ENOMEM);
 
-		if ((*mask_ace)->flag != ace->flag || !same_who(*mask_ace, ace))
-			goto out;
+	pace = pacl->a_entries;
+	pace->e_tag = ACL_USER_OBJ;
+	error = check_deny(state->owner.deny, 1);
+	if (error)
+		goto out_err;
+	low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
+	pace->e_id = ACL_UNDEFINED_ID;
+
+	for (i=0; i < state->users->n; i++) {
+		pace++;
+		pace->e_tag = ACL_USER;
+		error = check_deny(state->users->aces[i].perms.deny, 0);
+		if (error)
+			goto out_err;
+		low_mode_from_nfs4(state->users->aces[i].perms.allow,
+					&pace->e_perm, flags);
+		pace->e_id = state->users->aces[i].uid;
+		add_to_mask(state, &state->users->aces[i].perms);
 	}
 
-	if (ace2type(ace) != ACL_GROUP_OBJ)
-		goto out;
-
-	ac = kmalloc(sizeof(*ac), GFP_KERNEL);
-	error = -ENOMEM;
-	if (ac == NULL)
-		goto out;
-	ac->ace = ace;
-	list_add_tail(&ac->ace_l, &group_l);
-
-	error = -EINVAL;
-	if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
-		goto out;
-
-	error = write_pace(ace, pacl, pace, ACL_GROUP_OBJ, flags);
-	if (error < 0)
-		goto out;
-
-	error = -EINVAL;
-	ace = get_next_v4_ace(p, &n4acl->ace_head);
-	if (ace == NULL)
-		goto out;
-
-	/* groups (mask and allow aces) */
-
-	while (ace2type(ace) == ACL_GROUP) {
-		if (*mask_ace == NULL)
-			goto out;
-
-		if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE ||
-			!MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask))
-			goto out;
-		*mask_ace = ace;
+	pace++;
+	pace->e_tag = ACL_GROUP_OBJ;
+	error = check_deny(state->group.deny, 0);
+	if (error)
+		goto out_err;
+	low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
+	pace->e_id = ACL_UNDEFINED_ID;
+	add_to_mask(state, &state->group);
+
+	for (i=0; i < state->groups->n; i++) {
+		pace++;
+		pace->e_tag = ACL_GROUP;
+		error = check_deny(state->groups->aces[i].perms.deny, 0);
+		if (error)
+			goto out_err;
+		low_mode_from_nfs4(state->groups->aces[i].perms.allow,
+					&pace->e_perm, flags);
+		pace->e_id = state->groups->aces[i].uid;
+		add_to_mask(state, &state->groups->aces[i].perms);
+	}
 
-		ace = get_next_v4_ace(p, &n4acl->ace_head);
-		if (ace == NULL)
-			goto out;
-		ac = kmalloc(sizeof(*ac), GFP_KERNEL);
-		error = -ENOMEM;
-		if (ac == NULL)
-			goto out;
-		error = -EINVAL;
-		if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE ||
-				!same_who(ace, *mask_ace))
-			goto out;
+	pace++;
+	pace->e_tag = ACL_MASK;
+	low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
+	pace->e_id = ACL_UNDEFINED_ID;
 
-		ac->ace = ace;
-		list_add_tail(&ac->ace_l, &group_l);
+	pace++;
+	pace->e_tag = ACL_OTHER;
+	error = check_deny(state->other.deny, 0);
+	if (error)
+		goto out_err;
+	low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
+	pace->e_id = ACL_UNDEFINED_ID;
 
-		error = write_pace(ace, pacl, pace, ACL_GROUP, flags);
-		if (error < 0)
-			goto out;
-		error = -EINVAL;
-		ace = get_next_v4_ace(p, &n4acl->ace_head);
-		if (ace == NULL)
-			goto out;
-	}
+	return pacl;
+out_err:
+	posix_acl_release(pacl);
+	return ERR_PTR(error);
+}
 
-	/* group owner (deny ace) */
+static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
+{
+	/* Allow all bits in the mask not already denied: */
+	astate->allow |= mask & ~astate->deny;
+}
 
-	if (ace2type(ace) != ACL_GROUP_OBJ)
-		goto out;
-	ac = list_entry(group_l.next, struct ace_container, ace_l);
-	ace2 = ac->ace;
-	if (!complementary_ace_pair(ace2, ace, flags))
-		goto out;
-	list_del(group_l.next);
-	kfree(ac);
+static inline void deny_bits(struct posix_ace_state *astate, u32 mask)
+{
+	/* Deny all bits in the mask not already allowed: */
+	astate->deny |= mask & ~astate->allow;
+}
 
-	/* groups (deny aces) */
+static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid)
+{
+	int i;
 
-	while (!list_empty(&group_l)) {
-		ace = get_next_v4_ace(p, &n4acl->ace_head);
-		if (ace == NULL)
-			goto out;
-		if (ace2type(ace) != ACL_GROUP)
-			goto out;
-		ac = list_entry(group_l.next, struct ace_container, ace_l);
-		ace2 = ac->ace;
-		if (!complementary_ace_pair(ace2, ace, flags))
-			goto out;
-		list_del(group_l.next);
-		kfree(ac);
-	}
+	for (i = 0; i < a->n; i++)
+		if (a->aces[i].uid == uid)
+			return i;
+	/* Not found: */
+	a->n++;
+	a->aces[i].uid = uid;
+	a->aces[i].perms.allow = state->everyone.allow;
+	a->aces[i].perms.deny  = state->everyone.deny;
 
-	ace = get_next_v4_ace(p, &n4acl->ace_head);
-	if (ace == NULL)
-		goto out;
-	if (ace2type(ace) != ACL_OTHER)
-		goto out;
-	error = 0;
-out:
-	while (!list_empty(&group_l)) {
-		ac = list_entry(group_l.next, struct ace_container, ace_l);
-		list_del(group_l.next);
-		kfree(ac);
-	}
-	return error;
+	return i;
 }
 
-static inline int
-mask_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
-		struct nfs4_ace **mask_ace,
-		struct posix_acl *pacl, struct posix_acl_entry **pace,
-		unsigned int flags)
+static void deny_bits_array(struct posix_ace_state_array *a, u32 mask)
 {
-	int error = -EINVAL;
-	struct nfs4_ace *ace;
+	int i;
 
-	ace = list_entry(*p, struct nfs4_ace, l_ace);
-	if (pacl->a_count != 3) {
-		if (*mask_ace == NULL)
-			goto out;
-		(*mask_ace)->access_mask = deny_mask((*mask_ace)->access_mask, flags);
-		write_pace(*mask_ace, pacl, pace, ACL_MASK, flags);
-	}
-	error = 0;
-out:
-	return error;
+	for (i=0; i < a->n; i++)
+		deny_bits(&a->aces[i].perms, mask);
 }
 
-static inline int
-other_from_v4(struct nfs4_acl *n4acl, struct list_head **p,
-		struct posix_acl *pacl, struct posix_acl_entry **pace,
-		unsigned int flags)
+static void allow_bits_array(struct posix_ace_state_array *a, u32 mask)
 {
-	int error = -EINVAL;
-	struct nfs4_ace *ace, *ace2;
+	int i;
 
-	ace = list_entry(*p, struct nfs4_ace, l_ace);
-	if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE)
-		goto out;
-	error = write_pace(ace, pacl, pace, ACL_OTHER, flags);
-	if (error < 0)
-		goto out;
-	error = -EINVAL;
-	ace2 = get_next_v4_ace(p, &n4acl->ace_head);
-	if (ace2 == NULL)
-		goto out;
-	if (!complementary_ace_pair(ace, ace2, flags))
-		goto out;
-	error = 0;
-out:
-	return error;
+	for (i=0; i < a->n; i++)
+		allow_bits(&a->aces[i].perms, mask);
 }
 
-static int
-calculate_posix_ace_count(struct nfs4_acl *n4acl)
+static void process_one_v4_ace(struct posix_acl_state *state,
+				struct nfs4_ace *ace)
 {
-	if (n4acl->naces == 6) /* owner, owner group, and other only */
-		return 3;
-	else { /* Otherwise there must be a mask entry. */
-		/* Also, the remaining entries are for named users and
-		 * groups, and come in threes (mask, allow, deny): */
-		if (n4acl->naces < 7)
-			return -EINVAL;
-		if ((n4acl->naces - 7) % 3)
-			return -EINVAL;
-		return 4 + (n4acl->naces - 7)/3;
+	u32 mask = ace->access_mask;
+	int i;
+
+	switch (ace2type(ace)) {
+	case ACL_USER_OBJ:
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->owner, mask);
+		} else {
+			deny_bits(&state->owner, mask);
+		}
+		break;
+	case ACL_USER:
+		i = find_uid(state, state->users, ace->who);
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->users->aces[i].perms, mask);
+		} else {
+			deny_bits(&state->users->aces[i].perms, mask);
+			mask = state->users->aces[i].perms.deny;
+			deny_bits(&state->owner, mask);
+		}
+		break;
+	case ACL_GROUP_OBJ:
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->group, mask);
+		} else {
+			deny_bits(&state->group, mask);
+			mask = state->group.deny;
+			deny_bits(&state->owner, mask);
+			deny_bits(&state->everyone, mask);
+			deny_bits_array(state->users, mask);
+			deny_bits_array(state->groups, mask);
+		}
+		break;
+	case ACL_GROUP:
+		i = find_uid(state, state->groups, ace->who);
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->groups->aces[i].perms, mask);
+		} else {
+			deny_bits(&state->groups->aces[i].perms, mask);
+			mask = state->groups->aces[i].perms.deny;
+			deny_bits(&state->owner, mask);
+			deny_bits(&state->group, mask);
+			deny_bits(&state->everyone, mask);
+			deny_bits_array(state->users, mask);
+			deny_bits_array(state->groups, mask);
+		}
+		break;
+	case ACL_OTHER:
+		if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
+			allow_bits(&state->owner, mask);
+			allow_bits(&state->group, mask);
+			allow_bits(&state->other, mask);
+			allow_bits(&state->everyone, mask);
+			allow_bits_array(state->users, mask);
+			allow_bits_array(state->groups, mask);
+		} else {
+			deny_bits(&state->owner, mask);
+			deny_bits(&state->group, mask);
+			deny_bits(&state->other, mask);
+			deny_bits(&state->everyone, mask);
+			deny_bits_array(state->users, mask);
+			deny_bits_array(state->groups, mask);
+		}
 	}
 }
 
-
 static struct posix_acl *
 _nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags)
 {
+	struct posix_acl_state state;
 	struct posix_acl *pacl;
-	int error = -EINVAL, nace = 0;
-	struct list_head *p;
-	struct nfs4_ace *mask_ace = NULL;
-	struct posix_acl_entry *pace;
-
-	nace = calculate_posix_ace_count(n4acl);
-	if (nace < 0)
-		goto out_err;
-
-	pacl = posix_acl_alloc(nace, GFP_KERNEL);
-	error = -ENOMEM;
-	if (pacl == NULL)
-		goto out_err;
-
-	pace = &pacl->a_entries[0];
-	p = &n4acl->ace_head;
-
-	error = user_obj_from_v4(n4acl, &p, pacl, &pace, flags);
-	if (error)
-		goto out_acl;
-
-	error = users_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
-	if (error)
-		goto out_acl;
+	struct nfs4_ace *ace;
+	int ret;
 
-	error = group_obj_and_groups_from_v4(n4acl, &p, &mask_ace, pacl, &pace,
-						flags);
-	if (error)
-		goto out_acl;
+	ret = init_state(&state, n4acl->naces);
+	if (ret)
+		return ERR_PTR(ret);
 
-	error = mask_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags);
-	if (error)
-		goto out_acl;
-	error = other_from_v4(n4acl, &p, pacl, &pace, flags);
-	if (error)
-		goto out_acl;
+	list_for_each_entry(ace, &n4acl->ace_head, l_ace)
+		process_one_v4_ace(&state, ace);
 
-	error = -EINVAL;
-	if (p->next != &n4acl->ace_head)
-		goto out_acl;
-	if (pace != pacl->a_entries + pacl->a_count)
-		goto out_acl;
+	pacl = posix_state_to_acl(&state, flags);
 
-	sort_pacl(pacl);
+	free_state(&state);
 
-	return pacl;
-out_acl:
-	posix_acl_release(pacl);
-out_err:
-	pacl = ERR_PTR(error);
+	if (!IS_ERR(pacl))
+		sort_pacl(pacl);
 	return pacl;
 }
 
@@ -785,22 +700,41 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
 	list_for_each_safe(h, n, &acl->ace_head) {
 		ace = list_entry(h, struct nfs4_ace, l_ace);
 
-		if ((ace->flag & NFS4_INHERITANCE_FLAGS)
-				!= NFS4_INHERITANCE_FLAGS)
-			continue;
+		if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE &&
+		    ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE)
+			return -EINVAL;
 
-		error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
-				ace->access_mask, ace->whotype, ace->who);
-		if (error < 0)
-			goto out;
+		if (ace->flag & ~NFS4_SUPPORTED_FLAGS)
+			return -EINVAL;
 
-		list_del(h);
-		kfree(ace);
-		acl->naces--;
+		switch (ace->flag & NFS4_INHERITANCE_FLAGS) {
+		case 0:
+			/* Leave this ace in the effective acl: */
+			continue;
+		case NFS4_INHERITANCE_FLAGS:
+			/* Add this ace to the default acl and remove it
+			 * from the effective acl: */
+			error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
+				ace->access_mask, ace->whotype, ace->who);
+			if (error)
+				return error;
+			list_del(h);
+			kfree(ace);
+			acl->naces--;
+			break;
+		case NFS4_INHERITANCE_FLAGS & ~NFS4_ACE_INHERIT_ONLY_ACE:
+			/* Add this ace to the default, but leave it in
+			 * the effective acl as well: */
+			error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
+				ace->access_mask, ace->whotype, ace->who);
+			if (error)
+				return error;
+			break;
+		default:
+			return -EINVAL;
+		}
 	}
-
-out:
-	return error;
+	return 0;
 }
 
 static short
@@ -930,23 +864,6 @@ nfs4_acl_write_who(int who, char *p)
 	return -1;
 }
 
-static inline int
-match_who(struct nfs4_ace *ace, uid_t owner, gid_t group, uid_t who)
-{
-	switch (ace->whotype) {
-		case NFS4_ACL_WHO_NAMED:
-			return who == ace->who;
-		case NFS4_ACL_WHO_OWNER:
-			return who == owner;
-		case NFS4_ACL_WHO_GROUP:
-			return who == group;
-		case NFS4_ACL_WHO_EVERYONE:
-			return 1;
-		default:
-			return 0;
-	}
-}
-
 EXPORT_SYMBOL(nfs4_acl_new);
 EXPORT_SYMBOL(nfs4_acl_free);
 EXPORT_SYMBOL(nfs4_acl_add_ace);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 8583d99ee740..f57655a7a2b6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -85,8 +85,8 @@ enum nfs_cb_opnum4 {
 /*
 * Generic encode routines from fs/nfs/nfs4xdr.c
 */
-static inline u32 *
-xdr_writemem(u32 *p, const void *ptr, int nbytes)
+static inline __be32 *
+xdr_writemem(__be32 *p, const void *ptr, int nbytes)
 {
 	int tmp = XDR_QUADLEN(nbytes);
 	if (!tmp)
@@ -131,7 +131,7 @@ xdr_error:                                      \
 #define READ_BUF(nbytes)  do { \
 	p = xdr_inline_decode(xdr, nbytes); \
 	if (!p) { \
-		dprintk("NFSD: %s: reply buffer overflowed in line %d.", \
+		dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \
 			__FUNCTION__, __LINE__); \
 		return -EIO; \
 	} \
@@ -205,7 +205,7 @@ nfs_cb_stat_to_errno(int stat)
 static int
 encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 {
-	u32 * p;
+	__be32 * p;
 
 	RESERVE_SPACE(16);
 	WRITE32(0);            /* tag length is always 0 */
@@ -218,7 +218,7 @@ encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 static int
 encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 {
-	u32 *p;
+	__be32 *p;
 	int len = cb_rec->cbr_fhlen;
 
 	RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
@@ -231,7 +231,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 }
 
 static int
-nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p)
+nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
 {
 	struct xdr_stream xdrs, *xdr = &xdrs;
 
@@ -241,7 +241,7 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, u32 *p)
 }
 
 static int
-nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args)
+nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_cb_recall *args)
 {
 	struct xdr_stream xdr;
 	struct nfs4_cb_compound_hdr hdr = {
@@ -257,7 +257,7 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, u32 *p, struct nfs4_cb_recall *args
 
 static int
 decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
-        u32 *p;
+        __be32 *p;
 
         READ_BUF(8);
         READ32(hdr->status);
@@ -272,7 +272,7 @@ decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
 static int
 decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 {
-	u32 *p;
+	__be32 *p;
 	u32 op;
 	int32_t nfserr;
 
@@ -291,13 +291,13 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
 }
 
 static int
-nfs4_xdr_dec_cb_null(struct rpc_rqst *req, u32 *p)
+nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
 {
 	return 0;
 }
 
 static int
-nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, u32 *p)
+nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
 {
 	struct xdr_stream xdr;
 	struct nfs4_cb_compound_hdr hdr;
@@ -421,7 +421,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 
 	/* Create RPC client */
 	cb->cb_client = rpc_create(&args);
-	if (!cb->cb_client) {
+	if (IS_ERR(cb->cb_client)) {
 		dprintk("NFSD: couldn't create callback client\n");
 		goto out_err;
 	}
@@ -448,10 +448,10 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 out_rpciod:
 	atomic_dec(&clp->cl_count);
 	rpciod_down();
-	cb->cb_client = NULL;
 out_clnt:
 	rpc_shutdown_client(cb->cb_client);
 out_err:
+	cb->cb_client = NULL;
 	dprintk("NFSD: warning: no callback path to client %.*s\n",
 		(int)clp->cl_name.len, clp->cl_name.data);
 }
@@ -461,7 +461,7 @@ nfs4_cb_null(struct rpc_task *task, void *dummy)
 {
 	struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp;
 	struct nfs4_callback *cb = &clp->cl_callback;
-	u32 addr = htonl(cb->cb_addr);
+	__be32 addr = htonl(cb->cb_addr);
 
 	dprintk("NFSD: nfs4_cb_null task->tk_status %d\n", task->tk_status);
 
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index bea6b9478114..b1902ebaab41 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -573,10 +573,9 @@ idmap_lookup(struct svc_rqst *rqstp,
 	struct idmap_defer_req *mdr;
 	int ret;
 
-	mdr = kmalloc(sizeof(*mdr), GFP_KERNEL);
+	mdr = kzalloc(sizeof(*mdr), GFP_KERNEL);
 	if (!mdr)
 		return -ENOMEM;
-	memset(mdr, 0, sizeof(*mdr));
 	atomic_set(&mdr->count, 1);
 	init_waitqueue_head(&mdr->waitq);
 	mdr->req.defer = idmap_defer;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ee4eff27aedc..0a7bbdc4a10a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -67,32 +67,32 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
 	*dst = *src;
 }
 
-static int
-do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+static __be32
+do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
 {
-	int accmode, status;
+	__be32 status;
 
 	if (open->op_truncate &&
 		!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
 		return nfserr_inval;
 
-	accmode = MAY_NOP;
 	if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
-		accmode = MAY_READ;
-	if (open->op_share_deny & NFS4_SHARE_ACCESS_WRITE)
+		accmode |= MAY_READ;
+	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
 		accmode |= (MAY_WRITE | MAY_TRUNC);
-	accmode |= MAY_OWNER_OVERRIDE;
+	if (open->op_share_deny & NFS4_SHARE_DENY_WRITE)
+		accmode |= MAY_WRITE;
 
 	status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
 
 	return status;
 }
 
-static int
+static __be32
 do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
 	struct svc_fh resfh;
-	int status;
+	__be32 status;
 
 	fh_init(&resfh, NFS4_FHSIZE);
 	open->op_truncate = 0;
@@ -124,17 +124,17 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 				&resfh.fh_handle.fh_base,
 				resfh.fh_handle.fh_size);
 
-		status = do_open_permission(rqstp, current_fh, open);
+		status = do_open_permission(rqstp, current_fh, open, MAY_NOP);
 	}
 
 	fh_put(&resfh);
 	return status;
 }
 
-static int
+static __be32
 do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
-	int status;
+	__be32 status;
 
 	/* Only reclaims from previously confirmed clients are valid */
 	if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
@@ -155,16 +155,16 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
 		(open->op_iattr.ia_size == 0);
 
-	status = do_open_permission(rqstp, current_fh, open);
+	status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE);
 
 	return status;
 }
 
 
-static inline int
+static inline __be32
 nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, struct nfs4_stateowner **replay_owner)
 {
-	int status;
+	__be32 status;
 	dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
 		(int)open->op_fname.len, open->op_fname.data,
 		open->op_stateowner);
@@ -177,7 +177,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
 
 	/* check seqid for replay. set nfs4_owner */
 	status = nfsd4_process_open1(open);
-	if (status == NFSERR_REPLAY_ME) {
+	if (status == nfserr_replay_me) {
 		struct nfs4_replay *rp = &open->op_stateowner->so_replay;
 		fh_put(current_fh);
 		current_fh->fh_handle.fh_size = rp->rp_openfh_len;
@@ -188,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open
 			dprintk("nfsd4_open: replay failed"
 				" restoring previous filehandle\n");
 		else
-			status = NFSERR_REPLAY_ME;
+			status = nfserr_replay_me;
 	}
 	if (status)
 		goto out;
@@ -261,7 +261,7 @@ out:
 /*
  * filehandle-manipulating ops.
  */
-static inline int
+static inline __be32
 nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh)
 {
 	if (!current_fh->fh_dentry)
@@ -271,7 +271,7 @@ nfsd4_getfh(struct svc_fh *current_fh, struct svc_fh **getfh)
 	return nfs_ok;
 }
 
-static inline int
+static inline __be32
 nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putfh *putfh)
 {
 	fh_put(current_fh);
@@ -280,10 +280,10 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_putf
 	return fh_verify(rqstp, current_fh, 0, MAY_NOP);
 }
 
-static inline int
+static inline __be32
 nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
 {
-	int status;
+	__be32 status;
 
 	fh_put(current_fh);
 	status = exp_pseudoroot(rqstp->rq_client, current_fh,
@@ -291,7 +291,7 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
 	return status;
 }
 
-static inline int
+static inline __be32
 nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
 {
 	if (!save_fh->fh_dentry)
@@ -301,7 +301,7 @@ nfsd4_restorefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
 	return nfs_ok;
 }
 
-static inline int
+static inline __be32
 nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
 {
 	if (!current_fh->fh_dentry)
@@ -314,7 +314,7 @@ nfsd4_savefh(struct svc_fh *current_fh, struct svc_fh *save_fh)
 /*
  * misc nfsv4 ops
  */
-static inline int
+static inline __be32
 nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_access *access)
 {
 	if (access->ac_req_access & ~NFS3_ACCESS_FULL)
@@ -324,10 +324,10 @@ nfsd4_access(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_acc
 	return nfsd_access(rqstp, current_fh, &access->ac_resp_access, &access->ac_supported);
 }
 
-static inline int
+static inline __be32
 nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_commit *commit)
 {
-	int status;
+	__be32 status;
 
 	u32 *p = (u32 *)commit->co_verf.data;
 	*p++ = nfssvc_boot.tv_sec;
@@ -339,11 +339,11 @@ nfsd4_commit(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_com
 	return status;
 }
 
-static int
+static __be32
 nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_create *create)
 {
 	struct svc_fh resfh;
-	int status;
+	__be32 status;
 	dev_t rdev;
 
 	fh_init(&resfh, NFS4_FHSIZE);
@@ -423,10 +423,10 @@ nfsd4_create(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_cre
 	return status;
 }
 
-static inline int
+static inline __be32
 nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_getattr *getattr)
 {
-	int status;
+	__be32 status;
 
 	status = fh_verify(rqstp, current_fh, 0, MAY_NOP);
 	if (status)
@@ -442,11 +442,11 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ge
 	return nfs_ok;
 }
 
-static inline int
+static inline __be32
 nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh,
 	   struct svc_fh *save_fh, struct nfsd4_link *link)
 {
-	int status = nfserr_nofilehandle;
+	__be32 status = nfserr_nofilehandle;
 
 	if (!save_fh->fh_dentry)
 		return status;
@@ -456,11 +456,11 @@ nfsd4_link(struct svc_rqst *rqstp, struct svc_fh *current_fh,
 	return status;
 }
 
-static int
+static __be32
 nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
 {
 	struct svc_fh tmp_fh;
-	int ret;
+	__be32 ret;
 
 	fh_init(&tmp_fh, NFS4_FHSIZE);
 	if((ret = exp_pseudoroot(rqstp->rq_client, &tmp_fh,
@@ -474,16 +474,16 @@ nfsd4_lookupp(struct svc_rqst *rqstp, struct svc_fh *current_fh)
 	return nfsd_lookup(rqstp, current_fh, "..", 2, current_fh);
 }
 
-static inline int
+static inline __be32
 nfsd4_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lookup *lookup)
 {
 	return nfsd_lookup(rqstp, current_fh, lookup->lo_name, lookup->lo_len, current_fh);
 }
 
-static inline int
+static inline __be32
 nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read)
 {
-	int status;
+	__be32 status;
 
 	/* no need to check permission - this will be done in nfsd_read() */
 
@@ -508,7 +508,7 @@ out:
 	return status;
 }
 
-static inline int
+static inline __be32
 nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readdir *readdir)
 {
 	u64 cookie = readdir->rd_cookie;
@@ -531,7 +531,7 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_re
 	return nfs_ok;
 }
 
-static inline int
+static inline __be32
 nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_readlink *readlink)
 {
 	readlink->rl_rqstp = rqstp;
@@ -539,10 +539,10 @@ nfsd4_readlink(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_r
 	return nfs_ok;
 }
 
-static inline int
+static inline __be32
 nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_remove *remove)
 {
-	int status;
+	__be32 status;
 
 	if (nfs4_in_grace())
 		return nfserr_grace;
@@ -556,11 +556,11 @@ nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_rem
 	return status;
 }
 
-static inline int
+static inline __be32
 nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh,
 	     struct svc_fh *save_fh, struct nfsd4_rename *rename)
 {
-	int status = nfserr_nofilehandle;
+	__be32 status = nfserr_nofilehandle;
 
 	if (!save_fh->fh_dentry)
 		return status;
@@ -589,10 +589,10 @@ nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh,
 	return status;
 }
 
-static inline int
+static inline __be32
 nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_setattr *setattr)
 {
-	int status = nfs_ok;
+	__be32 status = nfs_ok;
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		nfs4_lock_state();
@@ -600,7 +600,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_se
 			&setattr->sa_stateid, CHECK_FH | WR_STATE, NULL);
 		nfs4_unlock_state();
 		if (status) {
-			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!");
+			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
 			return status;
 		}
 	}
@@ -614,13 +614,13 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_se
 	return status;
 }
 
-static inline int
+static inline __be32
 nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_write *write)
 {
 	stateid_t *stateid = &write->wr_stateid;
 	struct file *filp = NULL;
 	u32 *p;
-	int status = nfs_ok;
+	__be32 status = nfs_ok;
 
 	/* no need to check permission - this will be done in nfsd_write() */
 
@@ -646,7 +646,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
 	*p++ = nfssvc_boot.tv_usec;
 
 	status =  nfsd_write(rqstp, current_fh, filp, write->wr_offset,
-			write->wr_vec, write->wr_vlen, write->wr_buflen,
+			rqstp->rq_vec, write->wr_vlen, write->wr_buflen,
 			&write->wr_how_written);
 	if (filp)
 		fput(filp);
@@ -661,12 +661,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ
  * attributes matched.  VERIFY is implemented by mapping NFSERR_SAME
  * to NFS_OK after the call; NVERIFY by mapping NFSERR_NOT_SAME to NFS_OK.
  */
-static int
+static __be32
 nfsd4_verify(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_verify *verify)
 {
-	u32 *buf, *p;
+	__be32 *buf, *p;
 	int count;
-	int status;
+	__be32 status;
 
 	status = fh_verify(rqstp, current_fh, 0, MAY_NOP);
 	if (status)
@@ -715,7 +715,7 @@ out_kfree:
 /*
  * NULL call.
  */
-static int
+static __be32
 nfsd4_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
@@ -731,7 +731,7 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
 /*
  * COMPOUND call.
  */
-static int
+static __be32
 nfsd4_proc_compound(struct svc_rqst *rqstp,
 		    struct nfsd4_compoundargs *args,
 		    struct nfsd4_compoundres *resp)
@@ -741,7 +741,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	struct svc_fh	*save_fh = NULL;
 	struct nfs4_stateowner *replay_owner = NULL;
 	int		slack_space;    /* in words, not bytes! */
-	int		status;
+	__be32		status;
 
 	status = nfserr_resource;
 	current_fh = kmalloc(sizeof(*current_fh), GFP_KERNEL);
@@ -802,13 +802,29 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		* SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH
 		* require a valid current filehandle
 		*/
-		if ((!current_fh->fh_dentry) &&
-		   !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) ||
-		   (op->opnum == OP_SETCLIENTID) ||
-		   (op->opnum == OP_SETCLIENTID_CONFIRM) ||
-		   (op->opnum == OP_RENEW) || (op->opnum == OP_RESTOREFH) ||
-		   (op->opnum == OP_RELEASE_LOCKOWNER))) {
-			op->status = nfserr_nofilehandle;
+		if (!current_fh->fh_dentry) {
+			if (!((op->opnum == OP_PUTFH) ||
+			      (op->opnum == OP_PUTROOTFH) ||
+			      (op->opnum == OP_SETCLIENTID) ||
+			      (op->opnum == OP_SETCLIENTID_CONFIRM) ||
+			      (op->opnum == OP_RENEW) ||
+			      (op->opnum == OP_RESTOREFH) ||
+			      (op->opnum == OP_RELEASE_LOCKOWNER))) {
+				op->status = nfserr_nofilehandle;
+				goto encode_op;
+			}
+		}
+		/* Check must be done at start of each operation, except
+		 * for GETATTR and ops not listed as returning NFS4ERR_MOVED
+		 */
+		else if (current_fh->fh_export->ex_fslocs.migrated &&
+			 !((op->opnum == OP_GETATTR) ||
+			   (op->opnum == OP_PUTROOTFH) ||
+			   (op->opnum == OP_PUTPUBFH) ||
+			   (op->opnum == OP_RENEW) ||
+			   (op->opnum == OP_SETCLIENTID) ||
+			   (op->opnum == OP_RELEASE_LOCKOWNER))) {
+			op->status = nfserr_moved;
 			goto encode_op;
 		}
 		switch (op->opnum) {
@@ -921,7 +937,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		}
 
 encode_op:
-		if (op->status == NFSERR_REPLAY_ME) {
+		if (op->status == nfserr_replay_me) {
 			op->replay = &replay_owner->so_replay;
 			nfsd4_encode_replay(resp, op);
 			status = op->status = op->replay->rp_status;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e35d7e52fdeb..e9d07704680e 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -83,13 +83,13 @@ md5_to_hex(char *out, char *md5)
 	*out = '\0';
 }
 
-int
+__be32
 nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
 {
 	struct xdr_netobj cksum;
 	struct hash_desc desc;
 	struct scatterlist sg[1];
-	int status = nfserr_resource;
+	__be32 status = nfserr_resource;
 
 	dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
 			clname->len, clname->data);
@@ -184,7 +184,7 @@ struct dentry_list_arg {
 
 static int
 nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
-		loff_t offset, ino_t ino, unsigned int d_type)
+		loff_t offset, u64 ino, unsigned int d_type)
 {
 	struct dentry_list_arg *dla = arg;
 	struct list_head *dentries = &dla->dentries;
@@ -193,7 +193,7 @@ nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
 	struct dentry_list *child;
 
 	if (name && isdotent(name, namlen))
-		return nfs_ok;
+		return 0;
 	dentry = lookup_one_len(name, parent, namlen);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
@@ -333,14 +333,14 @@ purge_old(struct dentry *parent, struct dentry *child)
 	int status;
 
 	if (nfs4_has_reclaimed_state(child->d_name.name))
-		return nfs_ok;
+		return 0;
 
 	status = nfsd4_clear_clid_dir(parent, child);
 	if (status)
 		printk("failed to remove client recovery directory %s\n",
 				child->d_name.name);
 	/* Keep trying, success or failure: */
-	return nfs_ok;
+	return 0;
 }
 
 void
@@ -365,10 +365,10 @@ load_recdir(struct dentry *parent, struct dentry *child)
 		printk("nfsd4: illegal name %s in recovery directory\n",
 				child->d_name.name);
 		/* Keep trying; maybe the others are OK: */
-		return nfs_ok;
+		return 0;
 	}
 	nfs4_client_to_reclaim(child->d_name.name);
-	return nfs_ok;
+	return 0;
 }
 
 int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9daa0b9feb8d..293b6495829f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -339,8 +339,7 @@ alloc_client(struct xdr_netobj name)
 {
 	struct nfs4_client *clp;
 
-	if ((clp = kmalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) {
-		memset(clp, 0, sizeof(*clp));
+	if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) {
 		if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) {
 			memcpy(clp->cl_name.data, name.data, name.len);
 			clp->cl_name.len = name.len;
@@ -711,10 +710,10 @@ out_err:
  *		as described above.
  *
  */
-int
+__be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
 {
-	u32 			ip_addr = rqstp->rq_addr.sin_addr.s_addr;
+	__be32 			ip_addr = rqstp->rq_addr.sin_addr.s_addr;
 	struct xdr_netobj 	clname = { 
 		.len = setclid->se_namelen,
 		.data = setclid->se_name,
@@ -722,7 +721,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
 	nfs4_verifier		clverifier = setclid->se_verf;
 	unsigned int 		strhashval;
 	struct nfs4_client	*conf, *unconf, *new;
-	int 			status;
+	__be32 			status;
 	char                    dname[HEXDIR_LEN];
 	
 	if (!check_name(clname))
@@ -876,14 +875,14 @@ out:
  *
  * NOTE: callback information will be processed here in a future patch
  */
-int
+__be32
 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confirm *setclientid_confirm)
 {
-	u32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
+	__be32 ip_addr = rqstp->rq_addr.sin_addr.s_addr;
 	struct nfs4_client *conf, *unconf;
 	nfs4_verifier confirm = setclientid_confirm->sc_confirm; 
 	clientid_t * clid = &setclientid_confirm->sc_clientid;
-	int status;
+	__be32 status;
 
 	if (STALE_CLIENTID(clid))
 		return nfserr_stale_clientid;
@@ -1006,13 +1005,10 @@ alloc_init_file(struct inode *ino)
 static void
 nfsd4_free_slab(kmem_cache_t **slab)
 {
-	int status;
-
 	if (*slab == NULL)
 		return;
-	status = kmem_cache_destroy(*slab);
+	kmem_cache_destroy(*slab);
 	*slab = NULL;
-	WARN_ON(status);
 }
 
 static void
@@ -1284,13 +1280,13 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
  * Called to check deny when READ with all zero stateid or
  * WRITE with all zero or all one stateid
  */
-static int
+static __be32
 nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 {
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_file *fp;
 	struct nfs4_stateid *stp;
-	int ret;
+	__be32 ret;
 
 	dprintk("NFSD: nfs4_share_conflict\n");
 
@@ -1448,7 +1444,7 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
 };
 
 
-int
+__be32
 nfsd4_process_open1(struct nfsd4_open *open)
 {
 	clientid_t *clientid = &open->op_clientid;
@@ -1481,7 +1477,7 @@ nfsd4_process_open1(struct nfsd4_open *open)
 	}
 	if (open->op_seqid == sop->so_seqid - 1) {
 		if (sop->so_replay.rp_buflen)
-			return NFSERR_REPLAY_ME;
+			return nfserr_replay_me;
 		/* The original OPEN failed so spectacularly
 		 * that we don't even have replay data saved!
 		 * Therefore, we have no choice but to continue
@@ -1505,7 +1501,7 @@ renew:
 	return nfs_ok;
 }
 
-static inline int
+static inline __be32
 nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
 {
 	if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ))
@@ -1526,12 +1522,12 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 	return NULL;
 }
 
-static int
+static __be32
 nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
 		struct nfs4_delegation **dp)
 {
 	int flags;
-	int status = nfserr_bad_stateid;
+	__be32 status = nfserr_bad_stateid;
 
 	*dp = find_delegation_file(fp, &open->op_delegate_stateid);
 	if (*dp == NULL)
@@ -1550,11 +1546,11 @@ out:
 	return nfs_ok;
 }
 
-static int
+static __be32
 nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp)
 {
 	struct nfs4_stateid *local;
-	int status = nfserr_share_denied;
+	__be32 status = nfserr_share_denied;
 	struct nfs4_stateowner *sop = open->op_stateowner;
 
 	list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
@@ -1579,7 +1575,7 @@ nfs4_alloc_stateid(void)
 	return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
 }
 
-static int
+static __be32
 nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
 		struct nfs4_delegation *dp,
 		struct svc_fh *cur_fh, int flags)
@@ -1594,7 +1590,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
 		get_file(dp->dl_vfs_file);
 		stp->st_vfs_file = dp->dl_vfs_file;
 	} else {
-		int status;
+		__be32 status;
 		status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
 				&stp->st_vfs_file);
 		if (status) {
@@ -1608,7 +1604,7 @@ nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
 	return 0;
 }
 
-static inline int
+static inline __be32
 nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 		struct nfsd4_open *open)
 {
@@ -1623,22 +1619,22 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 	return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0);
 }
 
-static int
+static __be32
 nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
 {
 	struct file *filp = stp->st_vfs_file;
 	struct inode *inode = filp->f_dentry->d_inode;
 	unsigned int share_access, new_writer;
-	int status;
+	__be32 status;
 
 	set_access(&share_access, stp->st_access_bmap);
 	new_writer = (~share_access) & open->op_share_access
 			& NFS4_SHARE_ACCESS_WRITE;
 
 	if (new_writer) {
-		status = get_write_access(inode);
-		if (status)
-			return nfserrno(status);
+		int err = get_write_access(inode);
+		if (err)
+			return nfserrno(err);
 	}
 	status = nfsd4_truncate(rqstp, cur_fh, open);
 	if (status) {
@@ -1742,14 +1738,14 @@ out:
 /*
  * called with nfs4_lock_state() held.
  */
-int
+__be32
 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
 	struct nfs4_file *fp = NULL;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
-	int status;
+	__be32 status;
 
 	status = nfserr_inval;
 	if (!access_valid(open->op_share_access)
@@ -1837,11 +1833,11 @@ static struct work_struct laundromat_work;
 static void laundromat_main(void *);
 static DECLARE_WORK(laundromat_work, laundromat_main, NULL);
 
-int 
+__be32
 nfsd4_renew(clientid_t *clid)
 {
 	struct nfs4_client *clp;
-	int status;
+	__be32 status;
 
 	nfs4_lock_state();
 	dprintk("process_renew(%08x/%08x): starting\n", 
@@ -2000,9 +1996,9 @@ access_permit_write(unsigned long access_bmap)
 }
 
 static
-int nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
+__be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
 {
-        int status = nfserr_openmode;
+        __be32 status = nfserr_openmode;
 
 	if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
                 goto out;
@@ -2013,7 +2009,7 @@ out:
 	return status;
 }
 
-static inline int
+static inline __be32
 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 {
 	/* Trying to call delegreturn with a special stateid? Yuch: */
@@ -2047,14 +2043,14 @@ io_during_grace_disallowed(struct inode *inode, int flags)
 /*
 * Checks for stateid operations
 */
-int
+__be32
 nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
 {
 	struct nfs4_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
 	stateid_t *stidp;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
-	int status;
+	__be32 status;
 
 	dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
 		stateid->si_boot, stateid->si_stateownerid, 
@@ -2129,7 +2125,7 @@ setlkflg (int type)
 /* 
  * Checks for sequence id mutating operations. 
  */
-static int
+static __be32
 nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
 {
 	struct nfs4_stateid *stp;
@@ -2173,7 +2169,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 		clientid_t *lockclid = &lock->v.new.clientid;
 		struct nfs4_client *clp = sop->so_client;
 		int lkflg = 0;
-		int status;
+		__be32 status;
 
 		lkflg = setlkflg(lock->lk_type);
 
@@ -2237,7 +2233,7 @@ check_replay:
 	if (seqid == sop->so_seqid - 1) {
 		dprintk("NFSD: preprocess_seqid_op: retransmission?\n");
 		/* indicate replay to calling function */
-		return NFSERR_REPLAY_ME;
+		return nfserr_replay_me;
 	}
 	printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n",
 			sop->so_seqid, seqid);
@@ -2245,10 +2241,10 @@ check_replay:
 	return nfserr_bad_seqid;
 }
 
-int
+__be32
 nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_confirm *oc, struct nfs4_stateowner **replay_owner)
 {
-	int status;
+	__be32 status;
 	struct nfs4_stateowner *sop;
 	struct nfs4_stateid *stp;
 
@@ -2314,10 +2310,10 @@ reset_union_bmap_deny(unsigned long deny, unsigned long *bmap)
 	}
 }
 
-int
+__be32
 nfsd4_open_downgrade(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open_downgrade *od, struct nfs4_stateowner **replay_owner)
 {
-	int status;
+	__be32 status;
 	struct nfs4_stateid *stp;
 	unsigned int share_access;
 
@@ -2369,10 +2365,10 @@ out:
 /*
  * nfs4_unlock_state() called after encode
  */
-int
+__be32
 nfsd4_close(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_close *close, struct nfs4_stateowner **replay_owner)
 {
-	int status;
+	__be32 status;
 	struct nfs4_stateid *stp;
 
 	dprintk("NFSD: nfsd4_close on file %.*s\n", 
@@ -2408,10 +2404,10 @@ out:
 	return status;
 }
 
-int
+__be32
 nfsd4_delegreturn(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_delegreturn *dr)
 {
-	int status;
+	__be32 status;
 
 	if ((status = fh_verify(rqstp, current_fh, S_IFREG, 0)))
 		goto out;
@@ -2639,7 +2635,7 @@ check_lock_length(u64 offset, u64 length)
 /*
  *  LOCK operation 
  */
-int
+__be32
 nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock *lock, struct nfs4_stateowner **replay_owner)
 {
 	struct nfs4_stateowner *open_sop = NULL;
@@ -2648,8 +2644,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	struct file *filp;
 	struct file_lock file_lock;
 	struct file_lock conflock;
-	int status = 0;
+	__be32 status = 0;
 	unsigned int strhashval;
+	int err;
 
 	dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
 		(long long) lock->lk_offset,
@@ -2762,13 +2759,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	 * locks_copy_lock: */
 	conflock.fl_ops = NULL;
 	conflock.fl_lmops = NULL;
-	status = posix_lock_file_conf(filp, &file_lock, &conflock);
+	err = posix_lock_file_conf(filp, &file_lock, &conflock);
 	dprintk("NFSD: nfsd4_lock: posix_lock_file_conf status %d\n",status);
-	switch (-status) {
+	switch (-err) {
 	case 0: /* success! */
 		update_stateid(&lock_stp->st_stateid);
 		memcpy(&lock->lk_resp_stateid, &lock_stp->st_stateid, 
 				sizeof(stateid_t));
+		status = 0;
 		break;
 	case (EAGAIN):		/* conflock holds conflicting lock */
 		status = nfserr_denied;
@@ -2779,7 +2777,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 		status = nfserr_deadlock;
 		break;
 	default:        
-		dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",status);
+		dprintk("NFSD: nfsd4_lock: posix_lock_file_conf() failed! status %d\n",err);
 		status = nfserr_resource;
 		break;
 	}
@@ -2797,14 +2795,14 @@ out:
 /*
  * LOCKT operation
  */
-int
+__be32
 nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lockt *lockt)
 {
 	struct inode *inode;
 	struct file file;
 	struct file_lock file_lock;
 	struct file_lock conflock;
-	int status;
+	__be32 status;
 
 	if (nfs4_in_grace())
 		return nfserr_grace;
@@ -2877,13 +2875,14 @@ out:
 	return status;
 }
 
-int
+__be32
 nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_locku *locku, struct nfs4_stateowner **replay_owner)
 {
 	struct nfs4_stateid *stp;
 	struct file *filp = NULL;
 	struct file_lock file_lock;
-	int status;
+	__be32 status;
+	int err;
 						        
 	dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
 		(long long) locku->lu_offset,
@@ -2921,8 +2920,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 	/*
 	*  Try to unlock the file in the VFS.
 	*/
-	status = posix_lock_file(filp, &file_lock); 
-	if (status) {
+	err = posix_lock_file(filp, &file_lock);
+	if (err) {
 		dprintk("NFSD: nfs4_locku: posix_lock_file failed!\n");
 		goto out_nfserr;
 	}
@@ -2941,7 +2940,7 @@ out:
 	return status;
 
 out_nfserr:
-	status = nfserrno(status);
+	status = nfserrno(err);
 	goto out;
 }
 
@@ -2969,7 +2968,7 @@ out:
 	return status;
 }
 
-int
+__be32
 nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *rlockowner)
 {
 	clientid_t *clid = &rlockowner->rl_clientid;
@@ -2978,7 +2977,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *
 	struct xdr_netobj *owner = &rlockowner->rl_owner;
 	struct list_head matches;
 	int i;
-	int status;
+	__be32 status;
 
 	dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
 		clid->cl_boot, clid->cl_id);
@@ -3115,7 +3114,7 @@ nfs4_find_reclaim_client(clientid_t *clid)
 /*
 * Called from OPEN. Look for clientid in reclaim list.
 */
-int
+__be32
 nfs4_check_open_reclaim(clientid_t *clid)
 {
 	return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5446a0861d1d..f3f239db04bb 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -60,8 +60,16 @@
 
 #define NFSDDBG_FACILITY		NFSDDBG_XDR
 
-static int
-check_filename(char *str, int len, int err)
+/*
+ * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing
+ * directory in order to indicate to the client that a filesystem boundary is present
+ * We use a fixed fsid for a referral
+ */
+#define NFS4_REFERRAL_FSID_MAJOR	0x8000000ULL
+#define NFS4_REFERRAL_FSID_MINOR	0x8000000ULL
+
+static __be32
+check_filename(char *str, int len, __be32 err)
 {
 	int i;
 
@@ -86,8 +94,8 @@ check_filename(char *str, int len, int err)
  * consistent with the style used in NFSv2/v3...
  */
 #define DECODE_HEAD				\
-	u32 *p;					\
-	int status
+	__be32 *p;				\
+	__be32 status
 #define DECODE_TAIL				\
 	status = 0;				\
 out:						\
@@ -136,13 +144,13 @@ xdr_error:					\
 	}					\
 } while (0)
 
-static u32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
+static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
 {
 	/* We want more bytes than seem to be available.
 	 * Maybe we need a new page, maybe we have just run out
 	 */
 	int avail = (char*)argp->end - (char*)argp->p;
-	u32 *p;
+	__be32 *p;
 	if (avail + argp->pagelen < nbytes)
 		return NULL;
 	if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */
@@ -189,7 +197,7 @@ defer_free(struct nfsd4_compoundargs *argp,
 	return 0;
 }
 
-static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
+static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
 {
 	void *new = NULL;
 	if (p == argp->tmp) {
@@ -198,8 +206,7 @@ static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
 		p = new;
 		memcpy(p, argp->tmp, nbytes);
 	} else {
-		if (p != argp->tmpp)
-			BUG();
+		BUG_ON(p != argp->tmpp);
 		argp->tmpp = NULL;
 	}
 	if (defer_free(argp, kfree, p)) {
@@ -210,7 +217,7 @@ static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
 }
 
 
-static int
+static __be32
 nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 {
 	u32 bmlen;
@@ -233,13 +240,14 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
     struct nfs4_acl **acl)
 {
 	int expected_len, len = 0;
 	u32 dummy32;
 	char *buf;
+	int host_err;
 
 	DECODE_HEAD;
 	iattr->ia_valid = 0;
@@ -273,7 +281,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 
 		*acl = nfs4_acl_new();
 		if (*acl == NULL) {
-			status = -ENOMEM;
+			host_err = -ENOMEM;
 			goto out_nfserr;
 		}
 		defer_free(argp, (void (*)(const void *))nfs4_acl_free, *acl);
@@ -288,20 +296,20 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 			len += XDR_QUADLEN(dummy32) << 2;
 			READMEM(buf, dummy32);
 			ace.whotype = nfs4_acl_get_whotype(buf, dummy32);
-			status = 0;
+			host_err = 0;
 			if (ace.whotype != NFS4_ACL_WHO_NAMED)
 				ace.who = 0;
 			else if (ace.flag & NFS4_ACE_IDENTIFIER_GROUP)
-				status = nfsd_map_name_to_gid(argp->rqstp,
+				host_err = nfsd_map_name_to_gid(argp->rqstp,
 						buf, dummy32, &ace.who);
 			else
-				status = nfsd_map_name_to_uid(argp->rqstp,
+				host_err = nfsd_map_name_to_uid(argp->rqstp,
 						buf, dummy32, &ace.who);
-			if (status)
+			if (host_err)
 				goto out_nfserr;
-			status = nfs4_acl_add_ace(*acl, ace.type, ace.flag,
+			host_err = nfs4_acl_add_ace(*acl, ace.type, ace.flag,
 				 ace.access_mask, ace.whotype, ace.who);
-			if (status)
+			if (host_err)
 				goto out_nfserr;
 		}
 	} else
@@ -320,7 +328,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 		READ_BUF(dummy32);
 		len += (XDR_QUADLEN(dummy32) << 2);
 		READMEM(buf, dummy32);
-		if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
+		if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
 			goto out_nfserr;
 		iattr->ia_valid |= ATTR_UID;
 	}
@@ -331,7 +339,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 		READ_BUF(dummy32);
 		len += (XDR_QUADLEN(dummy32) << 2);
 		READMEM(buf, dummy32);
-		if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
+		if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
 			goto out_nfserr;
 		iattr->ia_valid |= ATTR_GID;
 	}
@@ -407,11 +415,11 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 	DECODE_TAIL;
 
 out_nfserr:
-	status = nfserrno(status);
+	status = nfserrno(host_err);
 	goto out;
 }
 
-static int
+static __be32
 nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
 {
 	DECODE_HEAD;
@@ -422,7 +430,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
 {
 	DECODE_HEAD;
@@ -437,7 +445,7 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
 }
 
 
-static int
+static __be32
 nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
 {
 	DECODE_HEAD;
@@ -449,7 +457,7 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
 {
 	DECODE_HEAD;
@@ -489,7 +497,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
 	DECODE_TAIL;
 }
 
-static inline int
+static inline __be32
 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
 {
 	DECODE_HEAD;
@@ -501,13 +509,13 @@ nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegretu
 	DECODE_TAIL;
 }
 
-static inline int
+static inline __be32
 nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
 {
 	return nfsd4_decode_bitmap(argp, getattr->ga_bmval);
 }
 
-static int
+static __be32
 nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
 {
 	DECODE_HEAD;
@@ -522,7 +530,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 {
 	DECODE_HEAD;
@@ -561,7 +569,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
 {
 	DECODE_HEAD;
@@ -580,7 +588,7 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
 {
 	DECODE_HEAD;
@@ -599,7 +607,7 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
 {
 	DECODE_HEAD;
@@ -614,7 +622,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 {
 	DECODE_HEAD;
@@ -692,7 +700,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
 {
 	DECODE_HEAD;
@@ -706,7 +714,7 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
 {
 	DECODE_HEAD;
@@ -722,7 +730,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
 {
 	DECODE_HEAD;
@@ -737,7 +745,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
 {
 	DECODE_HEAD;
@@ -751,7 +759,7 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
 {
 	DECODE_HEAD;
@@ -767,7 +775,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
 {
 	DECODE_HEAD;
@@ -782,7 +790,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
 {
 	DECODE_HEAD;
@@ -802,7 +810,7 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
 {
 	DECODE_HEAD;
@@ -813,7 +821,7 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
 {
 	DECODE_HEAD;
@@ -827,7 +835,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
 {
 	DECODE_HEAD;
@@ -852,7 +860,7 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
 {
 	DECODE_HEAD;
@@ -865,7 +873,7 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
 }
 
 /* Also used for NVERIFY */
-static int
+static __be32
 nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
 {
 #if 0
@@ -901,7 +909,7 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
 {
 	int avail;
@@ -927,32 +935,32 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
 		printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); 
 		goto xdr_error;
 	}
-	write->wr_vec[0].iov_base = p;
-	write->wr_vec[0].iov_len = avail;
+	argp->rqstp->rq_vec[0].iov_base = p;
+	argp->rqstp->rq_vec[0].iov_len = avail;
 	v = 0;
 	len = write->wr_buflen;
-	while (len > write->wr_vec[v].iov_len) {
-		len -= write->wr_vec[v].iov_len;
+	while (len > argp->rqstp->rq_vec[v].iov_len) {
+		len -= argp->rqstp->rq_vec[v].iov_len;
 		v++;
-		write->wr_vec[v].iov_base = page_address(argp->pagelist[0]);
+		argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]);
 		argp->pagelist++;
 		if (argp->pagelen >= PAGE_SIZE) {
-			write->wr_vec[v].iov_len = PAGE_SIZE;
+			argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE;
 			argp->pagelen -= PAGE_SIZE;
 		} else {
-			write->wr_vec[v].iov_len = argp->pagelen;
+			argp->rqstp->rq_vec[v].iov_len = argp->pagelen;
 			argp->pagelen -= len;
 		}
 	}
-	argp->end = (u32*) (write->wr_vec[v].iov_base + write->wr_vec[v].iov_len);
-	argp->p = (u32*)  (write->wr_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
-	write->wr_vec[v].iov_len = len;
+	argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len);
+	argp->p = (__be32*)  (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
+	argp->rqstp->rq_vec[v].iov_len = len;
 	write->wr_vlen = v+1;
 
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
 {
 	DECODE_HEAD;
@@ -966,7 +974,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
 	DECODE_TAIL;
 }
 
-static int
+static __be32
 nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 {
 	DECODE_HEAD;
@@ -1172,7 +1180,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
  * task to translate them into Linux-specific versions which are more
  * consistent with the style used in NFSv2/v3...
  */
-#define ENCODE_HEAD              u32 *p
+#define ENCODE_HEAD              __be32 *p
 
 #define WRITE32(n)               *p++ = htonl(n)
 #define WRITE64(n)               do {				\
@@ -1202,8 +1210,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
  * Header routine to setup seqid operation replay cache
  */
 #define ENCODE_SEQID_OP_HEAD					\
-	u32 *p;							\
-	u32 *save;						\
+	__be32 *p;						\
+	__be32 *save;						\
 								\
 	save = resp->p;
 
@@ -1224,6 +1232,120 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
  			stateowner->so_replay.rp_buflen); 	\
 	} } while (0);
 
+/* Encode as an array of strings the string given with components
+ * seperated @sep.
+ */
+static __be32 nfsd4_encode_components(char sep, char *components,
+				   __be32 **pp, int *buflen)
+{
+	__be32 *p = *pp;
+	__be32 *countp = p;
+	int strlen, count=0;
+	char *str, *end;
+
+	dprintk("nfsd4_encode_components(%s)\n", components);
+	if ((*buflen -= 4) < 0)
+		return nfserr_resource;
+	WRITE32(0); /* We will fill this in with @count later */
+	end = str = components;
+	while (*end) {
+		for (; *end && (*end != sep); end++)
+			; /* Point to end of component */
+		strlen = end - str;
+		if (strlen) {
+			if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
+				return nfserr_resource;
+			WRITE32(strlen);
+			WRITEMEM(str, strlen);
+			count++;
+		}
+		else
+			end++;
+		str = end;
+	}
+	*pp = p;
+	p = countp;
+	WRITE32(count);
+	return 0;
+}
+
+/*
+ * encode a location element of a fs_locations structure
+ */
+static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
+				    __be32 **pp, int *buflen)
+{
+	__be32 status;
+	__be32 *p = *pp;
+
+	status = nfsd4_encode_components(':', location->hosts, &p, buflen);
+	if (status)
+		return status;
+	status = nfsd4_encode_components('/', location->path, &p, buflen);
+	if (status)
+		return status;
+	*pp = p;
+	return 0;
+}
+
+/*
+ * Return the path to an export point in the pseudo filesystem namespace
+ * Returned string is safe to use as long as the caller holds a reference
+ * to @exp.
+ */
+static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
+{
+	struct svc_fh tmp_fh;
+	char *path, *rootpath;
+
+	fh_init(&tmp_fh, NFS4_FHSIZE);
+	*stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle);
+	if (*stat)
+		return NULL;
+	rootpath = tmp_fh.fh_export->ex_path;
+
+	path = exp->ex_path;
+
+	if (strncmp(path, rootpath, strlen(rootpath))) {
+		printk("nfsd: fs_locations failed;"
+			"%s is not contained in %s\n", path, rootpath);
+		*stat = nfserr_notsupp;
+		return NULL;
+	}
+
+	return path + strlen(rootpath);
+}
+
+/*
+ *  encode a fs_locations structure
+ */
+static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
+				     struct svc_export *exp,
+				     __be32 **pp, int *buflen)
+{
+	__be32 status;
+	int i;
+	__be32 *p = *pp;
+	struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
+	char *root = nfsd4_path(rqstp, exp, &status);
+
+	if (status)
+		return status;
+	status = nfsd4_encode_components('/', root, &p, buflen);
+	if (status)
+		return status;
+	if ((*buflen -= 4) < 0)
+		return nfserr_resource;
+	WRITE32(fslocs->locations_count);
+	for (i=0; i<fslocs->locations_count; i++) {
+		status = nfsd4_encode_fs_location4(&fslocs->locations[i],
+						   &p, buflen);
+		if (status)
+			return status;
+	}
+	*pp = p;
+	return 0;
+}
 
 static u32 nfs4_ftypes[16] = {
         NF4BAD,  NF4FIFO, NF4CHR, NF4BAD,
@@ -1232,9 +1354,9 @@ static u32 nfs4_ftypes[16] = {
         NF4SOCK, NF4BAD,  NF4LNK, NF4BAD,
 };
 
-static int
+static __be32
 nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
-			u32 **p, int *buflen)
+			__be32 **p, int *buflen)
 {
 	int status;
 
@@ -1254,25 +1376,44 @@ nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
 	return 0;
 }
 
-static inline int
-nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, u32 **p, int *buflen)
+static inline __be32
+nfsd4_encode_user(struct svc_rqst *rqstp, uid_t uid, __be32 **p, int *buflen)
 {
 	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, uid, 0, p, buflen);
 }
 
-static inline int
-nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, u32 **p, int *buflen)
+static inline __be32
+nfsd4_encode_group(struct svc_rqst *rqstp, uid_t gid, __be32 **p, int *buflen)
 {
 	return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, gid, 1, p, buflen);
 }
 
-static inline int
+static inline __be32
 nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
-		u32 **p, int *buflen)
+		__be32 **p, int *buflen)
 {
 	return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen);
 }
 
+#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
+			      FATTR4_WORD0_RDATTR_ERROR)
+#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID
+
+static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
+{
+	/* As per referral draft:  */
+	if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS ||
+	    *bmval1 & ~WORD1_ABSENT_FS_ATTRS) {
+		if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR ||
+	            *bmval0 & FATTR4_WORD0_FS_LOCATIONS)
+			*rdattr_err = NFSERR_MOVED;
+		else
+			return nfserr_moved;
+	}
+	*bmval0 &= WORD0_ABSENT_FS_ATTRS;
+	*bmval1 &= WORD1_ABSENT_FS_ATTRS;
+	return 0;
+}
 
 /*
  * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
@@ -1281,9 +1422,9 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group,
  * @countp is the buffer size in _words_; upon successful return this becomes
  * replaced with the number of words written.
  */
-int
+__be32
 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
-		struct dentry *dentry, u32 *buffer, int *countp, u32 *bmval,
+		struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval,
 		struct svc_rqst *rqstp)
 {
 	u32 bmval0 = bmval[0];
@@ -1292,11 +1433,13 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	struct svc_fh tempfh;
 	struct kstatfs statfs;
 	int buflen = *countp << 2;
-	u32 *attrlenp;
+	__be32 *attrlenp;
 	u32 dummy;
 	u64 dummy64;
-	u32 *p = buffer;
-	int status;
+	u32 rdattr_err = 0;
+	__be32 *p = buffer;
+	__be32 status;
+	int err;
 	int aclsupport = 0;
 	struct nfs4_acl *acl = NULL;
 
@@ -1304,14 +1447,20 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0);
 	BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1);
 
-	status = vfs_getattr(exp->ex_mnt, dentry, &stat);
-	if (status)
+	if (exp->ex_fslocs.migrated) {
+		status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
+		if (status)
+			goto out;
+	}
+
+	err = vfs_getattr(exp->ex_mnt, dentry, &stat);
+	if (err)
 		goto out_nfserr;
 	if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) ||
 	    (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
-		status = vfs_statfs(dentry, &statfs);
-		if (status)
+		err = vfs_statfs(dentry, &statfs);
+		if (err)
 			goto out_nfserr;
 	}
 	if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
@@ -1323,18 +1472,23 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	}
 	if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
 			| FATTR4_WORD0_SUPPORTED_ATTRS)) {
-		status = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
-		aclsupport = (status == 0);
+		err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl);
+		aclsupport = (err == 0);
 		if (bmval0 & FATTR4_WORD0_ACL) {
-			if (status == -EOPNOTSUPP)
+			if (err == -EOPNOTSUPP)
 				bmval0 &= ~FATTR4_WORD0_ACL;
-			else if (status == -EINVAL) {
+			else if (err == -EINVAL) {
 				status = nfserr_attrnotsupp;
 				goto out;
-			} else if (status != 0)
+			} else if (err != 0)
 				goto out_nfserr;
 		}
 	}
+	if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
+		if (exp->ex_fslocs.locations == NULL) {
+			bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS;
+		}
+	}
 	if ((buflen -= 16) < 0)
 		goto out_resource;
 
@@ -1344,12 +1498,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	attrlenp = p++;                /* to be backfilled later */
 
 	if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+		u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0;
 		if ((buflen -= 12) < 0)
 			goto out_resource;
+		if (!aclsupport)
+			word0 &= ~FATTR4_WORD0_ACL;
+		if (!exp->ex_fslocs.locations)
+			word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
 		WRITE32(2);
-		WRITE32(aclsupport ?
-			NFSD_SUPPORTED_ATTRS_WORD0 :
-			NFSD_SUPPORTED_ATTRS_WORD0 & ~FATTR4_WORD0_ACL);
+		WRITE32(word0);
 		WRITE32(NFSD_SUPPORTED_ATTRS_WORD1);
 	}
 	if (bmval0 & FATTR4_WORD0_TYPE) {
@@ -1403,7 +1560,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval0 & FATTR4_WORD0_FSID) {
 		if ((buflen -= 16) < 0)
 			goto out_resource;
-		if (is_fsid(fhp, rqstp->rq_reffh)) {
+		if (exp->ex_fslocs.migrated) {
+			WRITE64(NFS4_REFERRAL_FSID_MAJOR);
+			WRITE64(NFS4_REFERRAL_FSID_MINOR);
+		} else if (is_fsid(fhp, rqstp->rq_reffh)) {
 			WRITE64((u64)exp->ex_fsid);
 			WRITE64((u64)0);
 		} else {
@@ -1426,7 +1586,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
 		if ((buflen -= 4) < 0)
 			goto out_resource;
-		WRITE32(0);
+		WRITE32(rdattr_err);
 	}
 	if (bmval0 & FATTR4_WORD0_ACL) {
 		struct nfs4_ace *ace;
@@ -1514,6 +1674,13 @@ out_acl:
 			goto out_resource;
 		WRITE64((u64) statfs.f_files);
 	}
+	if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
+		status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen);
+		if (status == nfserr_resource)
+			goto out_resource;
+		if (status)
+			goto out;
+	}
 	if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
 		if ((buflen -= 4) < 0)
 			goto out_resource;
@@ -1537,12 +1704,12 @@ out_acl:
 	if (bmval0 & FATTR4_WORD0_MAXREAD) {
 		if ((buflen -= 8) < 0)
 			goto out_resource;
-		WRITE64((u64) NFSSVC_MAXBLKSIZE);
+		WRITE64((u64) svc_max_payload(rqstp));
 	}
 	if (bmval0 & FATTR4_WORD0_MAXWRITE) {
 		if ((buflen -= 8) < 0)
 			goto out_resource;
-		WRITE64((u64) NFSSVC_MAXBLKSIZE);
+		WRITE64((u64) svc_max_payload(rqstp));
 	}
 	if (bmval1 & FATTR4_WORD1_MODE) {
 		if ((buflen -= 4) < 0)
@@ -1653,7 +1820,7 @@ out:
 		fh_put(&tempfh);
 	return status;
 out_nfserr:
-	status = nfserrno(status);
+	status = nfserrno(err);
 	goto out;
 out_resource:
 	*countp = 0;
@@ -1664,13 +1831,13 @@ out_serverfault:
 	goto out;
 }
 
-static int
+static __be32
 nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
-		const char *name, int namlen, u32 *p, int *buflen)
+		const char *name, int namlen, __be32 *p, int *buflen)
 {
 	struct svc_export *exp = cd->rd_fhp->fh_export;
 	struct dentry *dentry;
-	int nfserr;
+	__be32 nfserr;
 
 	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
@@ -1699,10 +1866,10 @@ out_put:
 	return nfserr;
 }
 
-static u32 *
-nfsd4_encode_rdattr_error(u32 *p, int buflen, int nfserr)
+static __be32 *
+nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr)
 {
-	u32 *attrlenp;
+	__be32 *attrlenp;
 
 	if (buflen < 6)
 		return NULL;
@@ -1722,8 +1889,8 @@ nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen,
 {
 	struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
 	int buflen;
-	u32 *p = cd->buffer;
-	int nfserr = nfserr_toosmall;
+	__be32 *p = cd->buffer;
+	__be32 nfserr = nfserr_toosmall;
 
 	/* In nfsv4, "." and ".." never make it onto the wire.. */
 	if (name && isdotent(name, namlen)) {
@@ -1779,7 +1946,7 @@ fail:
 }
 
 static void
-nfsd4_encode_access(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_access *access)
+nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
 {
 	ENCODE_HEAD;
 
@@ -1792,7 +1959,7 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_acc
 }
 
 static void
-nfsd4_encode_close(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_close *close)
+nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
 {
 	ENCODE_SEQID_OP_HEAD;
 
@@ -1807,7 +1974,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_clos
 
 
 static void
-nfsd4_encode_commit(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_commit *commit)
+nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
 {
 	ENCODE_HEAD;
 
@@ -1819,7 +1986,7 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_com
 }
 
 static void
-nfsd4_encode_create(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_create *create)
+nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
 {
 	ENCODE_HEAD;
 
@@ -1833,8 +2000,8 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_cre
 	}
 }
 
-static int
-nfsd4_encode_getattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_getattr *getattr)
+static __be32
+nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
 {
 	struct svc_fh *fhp = getattr->ga_fhp;
 	int buflen;
@@ -1846,14 +2013,13 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_ge
 	nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
 				    resp->p, &buflen, getattr->ga_bmval,
 				    resp->rqstp);
-
 	if (!nfserr)
 		resp->p += buflen;
 	return nfserr;
 }
 
 static void
-nfsd4_encode_getfh(struct nfsd4_compoundres *resp, int nfserr, struct svc_fh *fhp)
+nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp)
 {
 	unsigned int len;
 	ENCODE_HEAD;
@@ -1893,7 +2059,7 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
 }
 
 static void
-nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock *lock)
+nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
 {
 	ENCODE_SEQID_OP_HEAD;
 
@@ -1909,14 +2075,14 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
 }
 
 static void
-nfsd4_encode_lockt(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lockt *lockt)
+nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
 {
 	if (nfserr == nfserr_denied)
 		nfsd4_encode_lock_denied(resp, &lockt->lt_denied);
 }
 
 static void
-nfsd4_encode_locku(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_locku *locku)
+nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
 {
 	ENCODE_SEQID_OP_HEAD;
 
@@ -1932,7 +2098,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_lock
 
 
 static void
-nfsd4_encode_link(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_link *link)
+nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
 {
 	ENCODE_HEAD;
 
@@ -1945,7 +2111,7 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_link
 
 
 static void
-nfsd4_encode_open(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open *open)
+nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
 {
 	ENCODE_SEQID_OP_HEAD;
 
@@ -2010,7 +2176,7 @@ out:
 }
 
 static void
-nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open_confirm *oc)
+nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
 {
 	ENCODE_SEQID_OP_HEAD;
 				        
@@ -2025,7 +2191,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, int nfserr, struct nfs
 }
 
 static void
-nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open_downgrade *od)
+nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
 {
 	ENCODE_SEQID_OP_HEAD;
 				        
@@ -2039,8 +2205,9 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, int nfserr, struct n
 	ENCODE_SEQID_OP_TAIL(od->od_stateowner);
 }
 
-static int
-nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read *read)
+static __be32
+nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_read *read)
 {
 	u32 eof;
 	int v, pn;
@@ -2055,31 +2222,33 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
 
 	RESERVE_SPACE(8); /* eof flag and byte count */
 
-	maxcount = NFSSVC_MAXBLKSIZE;
+	maxcount = svc_max_payload(resp->rqstp);
 	if (maxcount > read->rd_length)
 		maxcount = read->rd_length;
 
 	len = maxcount;
 	v = 0;
 	while (len > 0) {
-		pn = resp->rqstp->rq_resused;
-		svc_take_page(resp->rqstp);
-		read->rd_iov[v].iov_base = page_address(resp->rqstp->rq_respages[pn]);
-		read->rd_iov[v].iov_len = len < PAGE_SIZE ? len : PAGE_SIZE;
+		pn = resp->rqstp->rq_resused++;
+		resp->rqstp->rq_vec[v].iov_base =
+			page_address(resp->rqstp->rq_respages[pn]);
+		resp->rqstp->rq_vec[v].iov_len =
+			len < PAGE_SIZE ? len : PAGE_SIZE;
 		v++;
 		len -= PAGE_SIZE;
 	}
 	read->rd_vlen = v;
 
 	nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp,
-			read->rd_offset, read->rd_iov, read->rd_vlen,
+			read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
 			&maxcount);
 
 	if (nfserr == nfserr_symlink)
 		nfserr = nfserr_inval;
 	if (nfserr)
 		return nfserr;
-	eof = (read->rd_offset + maxcount >= read->rd_fhp->fh_dentry->d_inode->i_size);
+	eof = (read->rd_offset + maxcount >=
+	       read->rd_fhp->fh_dentry->d_inode->i_size);
 
 	WRITE32(eof);
 	WRITE32(maxcount);
@@ -2089,7 +2258,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
 	resp->xbuf->page_len = maxcount;
 
 	/* Use rest of head for padding and remaining ops: */
-	resp->rqstp->rq_restailpage = 0;
 	resp->xbuf->tail[0].iov_base = p;
 	resp->xbuf->tail[0].iov_len = 0;
 	if (maxcount&3) {
@@ -2102,8 +2270,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
 	return 0;
 }
 
-static int
-nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_readlink *readlink)
+static __be32
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
 {
 	int maxcount;
 	char *page;
@@ -2114,8 +2282,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 	if (resp->xbuf->page_len)
 		return nfserr_resource;
 
-	svc_take_page(resp->rqstp);
-	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
 
 	maxcount = PAGE_SIZE;
 	RESERVE_SPACE(4);
@@ -2139,7 +2306,6 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 	resp->xbuf->page_len = maxcount;
 
 	/* Use rest of head for padding and remaining ops: */
-	resp->rqstp->rq_restailpage = 0;
 	resp->xbuf->tail[0].iov_base = p;
 	resp->xbuf->tail[0].iov_len = 0;
 	if (maxcount&3) {
@@ -2152,12 +2318,12 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
 	return 0;
 }
 
-static int
-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_readdir *readdir)
+static __be32
+nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
 {
 	int maxcount;
 	loff_t offset;
-	u32 *page, *savep, *tailbase;
+	__be32 *page, *savep, *tailbase;
 	ENCODE_HEAD;
 
 	if (nfserr)
@@ -2190,8 +2356,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
 		goto err_no_verf;
 	}
 
-	svc_take_page(resp->rqstp);
-	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+	page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
 	readdir->common.err = 0;
 	readdir->buflen = maxcount;
 	readdir->buffer = page;
@@ -2216,10 +2381,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
 	p = readdir->buffer;
 	*p++ = 0;	/* no more entries */
 	*p++ = htonl(readdir->common.err == nfserr_eof);
-	resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+	resp->xbuf->page_len = ((char*)p) - (char*)page_address(
+		resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
 
 	/* Use rest of head for padding and remaining ops: */
-	resp->rqstp->rq_restailpage = 0;
 	resp->xbuf->tail[0].iov_base = tailbase;
 	resp->xbuf->tail[0].iov_len = 0;
 	resp->p = resp->xbuf->tail[0].iov_base;
@@ -2233,7 +2398,7 @@ err_no_verf:
 }
 
 static void
-nfsd4_encode_remove(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_remove *remove)
+nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
 {
 	ENCODE_HEAD;
 
@@ -2245,7 +2410,7 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_rem
 }
 
 static void
-nfsd4_encode_rename(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_rename *rename)
+nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
 {
 	ENCODE_HEAD;
 
@@ -2262,7 +2427,7 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_ren
  * regardless of the error status.
  */
 static void
-nfsd4_encode_setattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_setattr *setattr)
+nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
 {
 	ENCODE_HEAD;
 
@@ -2281,7 +2446,7 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_se
 }
 
 static void
-nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_setclientid *scd)
+nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
 {
 	ENCODE_HEAD;
 
@@ -2300,7 +2465,7 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, int nfserr, struct nfsd
 }
 
 static void
-nfsd4_encode_write(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_write *write)
+nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
 {
 	ENCODE_HEAD;
 
@@ -2316,7 +2481,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_writ
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
-	u32 *statp;
+	__be32 *statp;
 	ENCODE_HEAD;
 
 	RESERVE_SPACE(8);
@@ -2454,7 +2619,7 @@ nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
  */
 
 int
-nfs4svc_encode_voidres(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
         return xdr_ressize_check(rqstp, p);
 }
@@ -2476,9 +2641,9 @@ void nfsd4_release_compoundargs(struct nfsd4_compoundargs *args)
 }
 
 int
-nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundargs *args)
+nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
 {
-	int status;
+	__be32 status;
 
 	args->p = p;
 	args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
@@ -2497,7 +2662,7 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoun
 }
 
 int
-nfs4svc_encode_compoundres(struct svc_rqst *rqstp, u32 *p, struct nfsd4_compoundres *resp)
+nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundres *resp)
 {
 	/*
 	 * All that remains is to write the tag and operation count...
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index fdf7cf3dfadc..6100bbe27432 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -29,7 +29,7 @@
  */
 #define CACHESIZE		1024
 #define HASHSIZE		64
-#define REQHASH(xid)		((((xid) >> 24) ^ (xid)) & (HASHSIZE-1))
+#define REQHASH(xid)		(((((__force __u32)xid) >> 24) ^ ((__force __u32)xid)) & (HASHSIZE-1))
 
 static struct hlist_head *	hash_list;
 static struct list_head 	lru_head;
@@ -127,8 +127,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type)
 	struct hlist_node	*hn;
 	struct hlist_head 	*rh;
 	struct svc_cacherep	*rp;
-	u32			xid = rqstp->rq_xid,
-				proto =  rqstp->rq_prot,
+	__be32			xid = rqstp->rq_xid;
+	u32			proto =  rqstp->rq_prot,
 				vers = rqstp->rq_vers,
 				proc = rqstp->rq_proc;
 	unsigned long		age;
@@ -258,7 +258,7 @@ found_entry:
  * In this case, nfsd_cache_update is called with statp == NULL.
  */
 void
-nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp)
+nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
 {
 	struct svc_cacherep *rp;
 	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7046ac9cf97f..39aed901514b 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -23,10 +23,14 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/ctype.h>
 
 #include <linux/nfs.h>
 #include <linux/nfsd_idmap.h>
+#include <linux/lockd/bind.h>
 #include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svcsock.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/cache.h>
 #include <linux/nfsd/xdr.h>
@@ -35,8 +39,6 @@
 
 #include <asm/uaccess.h>
 
-unsigned int nfsd_versbits = ~0;
-
 /*
  *	We have a single directory with 9 nodes in it.
  */
@@ -52,7 +54,10 @@ enum {
 	NFSD_List,
 	NFSD_Fh,
 	NFSD_Threads,
+	NFSD_Pool_Threads,
 	NFSD_Versions,
+	NFSD_Ports,
+	NFSD_MaxBlkSize,
 	/*
 	 * The below MUST come last.  Otherwise we leave a hole in nfsd_files[]
 	 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -75,7 +80,10 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size);
 static ssize_t write_getfs(struct file *file, char *buf, size_t size);
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
 static ssize_t write_threads(struct file *file, char *buf, size_t size);
+static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_versions(struct file *file, char *buf, size_t size);
+static ssize_t write_ports(struct file *file, char *buf, size_t size);
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
 #ifdef CONFIG_NFSD_V4
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
@@ -91,7 +99,10 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Getfs] = write_getfs,
 	[NFSD_Fh] = write_filehandle,
 	[NFSD_Threads] = write_threads,
+	[NFSD_Pool_Threads] = write_pool_threads,
 	[NFSD_Versions] = write_versions,
+	[NFSD_Ports] = write_ports,
+	[NFSD_MaxBlkSize] = write_maxblksize,
 #ifdef CONFIG_NFSD_V4
 	[NFSD_Leasetime] = write_leasetime,
 	[NFSD_RecoveryDir] = write_recoverydir,
@@ -358,6 +369,72 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+extern int nfsd_nrpools(void);
+extern int nfsd_get_nrthreads(int n, int *);
+extern int nfsd_set_nrthreads(int n, int *);
+
+static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
+{
+	/* if size > 0, look for an array of number of threads per node
+	 * and apply them  then write out number of threads per node as reply
+	 */
+	char *mesg = buf;
+	int i;
+	int rv;
+	int len;
+    	int npools = nfsd_nrpools();
+	int *nthreads;
+
+	if (npools == 0) {
+		/*
+		 * NFS is shut down.  The admin can start it by
+		 * writing to the threads file but NOT the pool_threads
+		 * file, sorry.  Report zero threads.
+		 */
+		strcpy(buf, "0\n");
+		return strlen(buf);
+	}
+
+	nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL);
+	if (nthreads == NULL)
+		return -ENOMEM;
+
+	if (size > 0) {
+		for (i = 0; i < npools; i++) {
+			rv = get_int(&mesg, &nthreads[i]);
+			if (rv == -ENOENT)
+				break;		/* fewer numbers than pools */
+			if (rv)
+				goto out_free;	/* syntax error */
+			rv = -EINVAL;
+			if (nthreads[i] < 0)
+				goto out_free;
+		}
+		rv = nfsd_set_nrthreads(i, nthreads);
+		if (rv)
+			goto out_free;
+	}
+
+	rv = nfsd_get_nrthreads(npools, nthreads);
+	if (rv)
+		goto out_free;
+
+	mesg = buf;
+	size = SIMPLE_TRANSACTION_LIMIT;
+	for (i = 0; i < npools && size > 0; i++) {
+		snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' '));
+		len = strlen(mesg);
+		size -= len;
+		mesg += len;
+	}
+
+	return (mesg-buf);
+
+out_free:
+	kfree(nthreads);
+	return rv;
+}
+
 static ssize_t write_versions(struct file *file, char *buf, size_t size)
 {
 	/*
@@ -372,6 +449,10 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
 
 	if (size>0) {
 		if (nfsd_serv)
+			/* Cannot change versions without updating
+			 * nfsd_serv->sv_xdrsize, and reallocing
+			 * rq_argp and rq_resp
+			 */
 			return -EBUSY;
 		if (buf[size-1] != '\n')
 			return -EINVAL;
@@ -390,10 +471,7 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
 			case 2:
 			case 3:
 			case 4:
-				if (sign != '-')
-					NFSCTL_VERSET(nfsd_versbits, num);
-				else
-					NFSCTL_VERUNSET(nfsd_versbits, num);
+				nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET);
 				break;
 			default:
 				return -EINVAL;
@@ -404,16 +482,15 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
 		/* If all get turned off, turn them back on, as
 		 * having no versions is BAD
 		 */
-		if ((nfsd_versbits & NFSCTL_VERALL)==0)
-			nfsd_versbits = NFSCTL_VERALL;
+		nfsd_reset_versions();
 	}
 	/* Now write current state into reply buffer */
 	len = 0;
 	sep = "";
 	for (num=2 ; num <= 4 ; num++)
-		if (NFSCTL_VERISSET(NFSCTL_VERALL, num)) {
+		if (nfsd_vers(num, NFSD_AVAIL)) {
 			len += sprintf(buf+len, "%s%c%d", sep,
-				       NFSCTL_VERISSET(nfsd_versbits, num)?'+':'-',
+				       nfsd_vers(num, NFSD_TEST)?'+':'-',
 				       num);
 			sep = " ";
 		}
@@ -421,6 +498,95 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
 	return len;
 }
 
+static ssize_t write_ports(struct file *file, char *buf, size_t size)
+{
+	if (size == 0) {
+		int len = 0;
+		lock_kernel();
+		if (nfsd_serv)
+			len = svc_sock_names(buf, nfsd_serv, NULL);
+		unlock_kernel();
+		return len;
+	}
+	/* Either a single 'fd' number is written, in which
+	 * case it must be for a socket of a supported family/protocol,
+	 * and we use it as an nfsd socket, or
+	 * A '-' followed by the 'name' of a socket in which case
+	 * we close the socket.
+	 */
+	if (isdigit(buf[0])) {
+		char *mesg = buf;
+		int fd;
+		int err;
+		err = get_int(&mesg, &fd);
+		if (err)
+			return -EINVAL;
+		if (fd < 0)
+			return -EINVAL;
+		err = nfsd_create_serv();
+		if (!err) {
+			int proto = 0;
+			err = svc_addsock(nfsd_serv, fd, buf, &proto);
+			if (err >= 0) {
+				err = lockd_up(proto);
+				if (err < 0)
+					svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
+			}
+			/* Decrease the count, but don't shutdown the
+			 * the service
+			 */
+			lock_kernel();
+			nfsd_serv->sv_nrthreads--;
+			unlock_kernel();
+		}
+		return err < 0 ? err : 0;
+	}
+	if (buf[0] == '-') {
+		char *toclose = kstrdup(buf+1, GFP_KERNEL);
+		int len = 0;
+		if (!toclose)
+			return -ENOMEM;
+		lock_kernel();
+		if (nfsd_serv)
+			len = svc_sock_names(buf, nfsd_serv, toclose);
+		unlock_kernel();
+		if (len >= 0)
+			lockd_down();
+		kfree(toclose);
+		return len;
+	}
+	return -EINVAL;
+}
+
+int nfsd_max_blksize;
+
+static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	if (size > 0) {
+		int bsize;
+		int rv = get_int(&mesg, &bsize);
+		if (rv)
+			return rv;
+		/* force bsize into allowed range and
+		 * required alignment.
+		 */
+		if (bsize < 1024)
+			bsize = 1024;
+		if (bsize > NFSSVC_MAXBLKSIZE)
+			bsize = NFSSVC_MAXBLKSIZE;
+		bsize &= ~(1024-1);
+		lock_kernel();
+		if (nfsd_serv && nfsd_serv->sv_nrthreads) {
+			unlock_kernel();
+			return -EBUSY;
+		}
+		nfsd_max_blksize = bsize;
+		unlock_kernel();
+	}
+	return sprintf(buf, "%d\n", nfsd_max_blksize);
+}
+
 #ifdef CONFIG_NFSD_V4
 extern time_t nfs4_leasetime(void);
 
@@ -483,7 +649,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_List] = {"exports", &exports_operations, S_IRUGO},
 		[NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
+		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 501d83884530..727ab3bd450d 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -76,7 +76,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
  * comment in the NFSv3 spec says this is incorrect (implementation notes for
  * the write call).
  */
-static inline int
+static inline __be32
 nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
 {
 	/* Type can be negative when creating hardlinks - not to a dir */
@@ -110,13 +110,13 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
  * This is only called at the start of an nfsproc call, so fhp points to
  * a svc_fh which is all 0 except for the over-the-wire file handle.
  */
-u32
+__be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
 {
 	struct knfsd_fh	*fh = &fhp->fh_handle;
 	struct svc_export *exp = NULL;
 	struct dentry	*dentry;
-	u32		error = 0;
+	__be32		error = 0;
 
 	dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp));
 
@@ -315,7 +315,7 @@ static inline void _fh_update_old(struct dentry *dentry,
 		fh->ofh_dirino = 0;
 }
 
-int
+__be32
 fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, struct svc_fh *ref_fh)
 {
 	/* ref_fh is a reference file handle.
@@ -451,7 +451,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, st
  * Update file handle information after changing a dentry.
  * This is only called by nfsd_create, nfsd_create_v3 and nfsd_proc_create
  */
-int
+__be32
 fh_update(struct svc_fh *fhp)
 {
 	struct dentry *dentry;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 06cd0db0f32b..ec983b777680 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -30,22 +30,22 @@ typedef struct svc_buf	svc_buf;
 #define NFSDDBG_FACILITY		NFSDDBG_PROC
 
 
-static int
+static __be32
 nfsd_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
 {
 	return nfs_ok;
 }
 
-static int
-nfsd_return_attrs(int err, struct nfsd_attrstat *resp)
+static __be32
+nfsd_return_attrs(__be32 err, struct nfsd_attrstat *resp)
 {
 	if (err) return err;
 	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
 				    resp->fh.fh_dentry,
 				    &resp->stat));
 }
-static int
-nfsd_return_dirop(int err, struct nfsd_diropres *resp)
+static __be32
+nfsd_return_dirop(__be32 err, struct nfsd_diropres *resp)
 {
 	if (err) return err;
 	return nfserrno(vfs_getattr(resp->fh.fh_export->ex_mnt,
@@ -56,11 +56,11 @@ nfsd_return_dirop(int err, struct nfsd_diropres *resp)
  * Get a file's attributes
  * N.B. After this call resp->fh needs an fh_put
  */
-static int
+static __be32
 nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
 					  struct nfsd_attrstat *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
@@ -72,11 +72,11 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle  *argp,
  * Set a file's attributes
  * N.B. After this call resp->fh needs an fh_put
  */
-static int
+static __be32
 nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
 					  struct nfsd_attrstat  *resp)
 {
-	int nfserr;
+	__be32 nfserr;
 	dprintk("nfsd: SETATTR  %s, valid=%x, size=%ld\n",
 		SVCFH_fmt(&argp->fh),
 		argp->attrs.ia_valid, (long) argp->attrs.ia_size);
@@ -92,11 +92,11 @@ nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp,
  * doesn't exist yet.
  * N.B. After this call resp->fh needs an fh_put
  */
-static int
+static __be32
 nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 					 struct nfsd_diropres  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: LOOKUP   %s %.*s\n",
 		SVCFH_fmt(&argp->fh), argp->len, argp->name);
@@ -112,11 +112,11 @@ nfsd_proc_lookup(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 /*
  * Read a symlink.
  */
-static int
+static __be32
 nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp,
 					   struct nfsd_readlinkres *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
 
@@ -132,11 +132,11 @@ nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_readlinkargs *argp,
  * Read a portion of a file.
  * N.B. After this call resp->fh needs an fh_put
  */
-static int
+static __be32
 nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 				       struct nfsd_readres  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: READ    %s %d bytes at %d\n",
 		SVCFH_fmt(&argp->fh),
@@ -146,20 +146,20 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
 	 * status, 17 words for fattr, and 1 word for the byte count.
 	 */
 
-	if (NFSSVC_MAXBLKSIZE < argp->count) {
+	if (NFSSVC_MAXBLKSIZE_V2 < argp->count) {
 		printk(KERN_NOTICE
 			"oversized read request from %u.%u.%u.%u:%d (%d bytes)\n",
 				NIPQUAD(rqstp->rq_addr.sin_addr.s_addr),
 				ntohs(rqstp->rq_addr.sin_port),
 				argp->count);
-		argp->count = NFSSVC_MAXBLKSIZE;
+		argp->count = NFSSVC_MAXBLKSIZE_V2;
 	}
 	svc_reserve(rqstp, (19<<2) + argp->count + 4);
 
 	resp->count = argp->count;
 	nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
 				  argp->offset,
-			   	  argp->vec, argp->vlen,
+			   	  rqstp->rq_vec, argp->vlen,
 				  &resp->count);
 
 	if (nfserr) return nfserr;
@@ -172,11 +172,11 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
  * Write data to a file
  * N.B. After this call resp->fh needs an fh_put
  */
-static int
+static __be32
 nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 					struct nfsd_attrstat  *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 	int	stable = 1;
 
 	dprintk("nfsd: WRITE    %s %d bytes at %d\n",
@@ -185,7 +185,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 
 	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
 				   argp->offset,
-				   argp->vec, argp->vlen,
+				   rqstp->rq_vec, argp->vlen,
 				   argp->len,
 				   &stable);
 	return nfsd_return_attrs(nfserr, resp);
@@ -197,7 +197,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
  * and the actual create() call in compliance with VFS protocols.
  * N.B. After this call _both_ argp->fh and resp->fh need an fh_put
  */
-static int
+static __be32
 nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 					 struct nfsd_diropres   *resp)
 {
@@ -206,7 +206,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 	struct iattr	*attr = &argp->attrs;
 	struct inode	*inode;
 	struct dentry	*dchild;
-	int		nfserr, type, mode;
+	int		type, mode;
+	__be32		nfserr;
 	dev_t		rdev = 0, wanted = new_decode_dev(attr->ia_size);
 
 	dprintk("nfsd: CREATE   %s %.*s\n",
@@ -225,7 +226,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 	nfserr = nfserr_exist;
 	if (isdotent(argp->name, argp->len))
 		goto done;
-	fh_lock(dirfhp);
+	fh_lock_nested(dirfhp, I_MUTEX_PARENT);
 	dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
 	if (IS_ERR(dchild)) {
 		nfserr = nfserrno(PTR_ERR(dchild));
@@ -348,11 +349,11 @@ done:
 	return nfsd_return_dirop(nfserr, resp);
 }
 
-static int
+static __be32
 nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 					 void		       *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: REMOVE   %s %.*s\n", SVCFH_fmt(&argp->fh),
 		argp->len, argp->name);
@@ -363,11 +364,11 @@ nfsd_proc_remove(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 	return nfserr;
 }
 
-static int
+static __be32
 nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp,
 				  	 void		        *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: RENAME   %s %.*s -> \n",
 		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname);
@@ -381,11 +382,11 @@ nfsd_proc_rename(struct svc_rqst *rqstp, struct nfsd_renameargs *argp,
 	return nfserr;
 }
 
-static int
+static __be32
 nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp,
 				void			    *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: LINK     %s ->\n",
 		SVCFH_fmt(&argp->ffh));
@@ -401,12 +402,12 @@ nfsd_proc_link(struct svc_rqst *rqstp, struct nfsd_linkargs *argp,
 	return nfserr;
 }
 
-static int
+static __be32
 nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
 				          void			  *resp)
 {
 	struct svc_fh	newfh;
-	int		nfserr;
+	__be32		nfserr;
 
 	dprintk("nfsd: SYMLINK  %s %.*s -> %.*s\n",
 		SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
@@ -430,11 +431,11 @@ nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
  * Make directory. This operation is not idempotent.
  * N.B. After this call resp->fh needs an fh_put
  */
-static int
+static __be32
 nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 					struct nfsd_diropres   *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: MKDIR    %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
 
@@ -454,11 +455,11 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 /*
  * Remove a directory
  */
-static int
+static __be32
 nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 				 	void		      *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: RMDIR    %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
 
@@ -470,11 +471,12 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp, struct nfsd_diropargs *argp,
 /*
  * Read a portion of a directory.
  */
-static int
+static __be32
 nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
 					  struct nfsd_readdirres  *resp)
 {
-	int		nfserr, count;
+	int		count;
+	__be32		nfserr;
 	loff_t		offset;
 
 	dprintk("nfsd: READDIR  %s %d bytes at %d\n",
@@ -509,11 +511,11 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp,
 /*
  * Get file system info
  */
-static int
+static __be32
 nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle   *argp,
 					  struct nfsd_statfsres *resp)
 {
-	int	nfserr;
+	__be32	nfserr;
 
 	dprintk("nfsd: STATFS   %s\n", SVCFH_fmt(&argp->fh));
 
@@ -553,7 +555,7 @@ static struct svc_procedure		nfsd_procedures2[18] = {
   PROC(none,	 void,		void,		none,		RC_NOCACHE, ST),
   PROC(lookup,	 diropargs,	diropres,	fhandle,	RC_NOCACHE, ST+FH+AT),
   PROC(readlink, readlinkargs,	readlinkres,	none,		RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4),
-  PROC(read,	 readargs,	readres,	fhandle,	RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4),
+  PROC(read,	 readargs,	readres,	fhandle,	RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4),
   PROC(none,	 void,		void,		none,		RC_NOCACHE, ST),
   PROC(write,	 writeargs,	attrstat,	fhandle,	RC_REPLBUFF, ST+AT),
   PROC(create,	 createargs,	diropres,	fhandle,	RC_REPLBUFF, ST+FH+AT),
@@ -579,11 +581,11 @@ struct svc_version	nfsd_version2 = {
 /*
  * Map errnos to NFS errnos.
  */
-int
+__be32
 nfserrno (int errno)
 {
 	static struct {
-		int	nfserr;
+		__be32	nfserr;
 		int	syserr;
 	} nfs_errtbl[] = {
 		{ nfs_ok, 0 },
@@ -615,11 +617,10 @@ nfserrno (int errno)
 		{ nfserr_badname, -ESRCH },
 		{ nfserr_io, -ETXTBSY },
 		{ nfserr_notsupp, -EOPNOTSUPP },
-		{ -1, -EIO }
 	};
 	int	i;
 
-	for (i = 0; nfs_errtbl[i].nfserr != -1; i++) {
+	for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
 		if (nfs_errtbl[i].syserr == errno)
 			return nfs_errtbl[i].nfserr;
 	}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ec1decf29bab..0aaccb03bf76 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -57,12 +57,6 @@ static atomic_t			nfsd_busy;
 static unsigned long		nfsd_last_call;
 static DEFINE_SPINLOCK(nfsd_call_lock);
 
-struct nfsd_list {
-	struct list_head 	list;
-	struct task_struct	*task;
-};
-static struct list_head nfsd_list = LIST_HEAD_INIT(nfsd_list);
-
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 static struct svc_stat	nfsd_acl_svcstats;
 static struct svc_version *	nfsd_acl_version[] = {
@@ -117,6 +111,32 @@ struct svc_program		nfsd_program = {
 
 };
 
+int nfsd_vers(int vers, enum vers_op change)
+{
+	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
+		return -1;
+	switch(change) {
+	case NFSD_SET:
+		nfsd_versions[vers] = nfsd_version[vers];
+		break;
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+		if (vers < NFSD_ACL_NRVERS)
+			nfsd_acl_version[vers] = nfsd_acl_version[vers];
+#endif
+	case NFSD_CLEAR:
+		nfsd_versions[vers] = NULL;
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+		if (vers < NFSD_ACL_NRVERS)
+			nfsd_acl_version[vers] = NULL;
+#endif
+		break;
+	case NFSD_TEST:
+		return nfsd_versions[vers] != NULL;
+	case NFSD_AVAIL:
+		return nfsd_version[vers] != NULL;
+	}
+	return 0;
+}
 /*
  * Maximum number of nfsd processes
  */
@@ -130,16 +150,192 @@ int nfsd_nrthreads(void)
 		return nfsd_serv->sv_nrthreads;
 }
 
+static int killsig;	/* signal that was used to kill last nfsd */
+static void nfsd_last_thread(struct svc_serv *serv)
+{
+	/* When last nfsd thread exits we need to do some clean-up */
+	struct svc_sock *svsk;
+	list_for_each_entry(svsk, &serv->sv_permsocks, sk_list)
+		lockd_down();
+	nfsd_serv = NULL;
+	nfsd_racache_shutdown();
+	nfs4_state_shutdown();
+
+	printk(KERN_WARNING "nfsd: last server has exited\n");
+	if (killsig != SIG_NOCLEAN) {
+		printk(KERN_WARNING "nfsd: unexporting all filesystems\n");
+		nfsd_export_flush();
+	}
+}
+
+void nfsd_reset_versions(void)
+{
+	int found_one = 0;
+	int i;
+
+	for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) {
+		if (nfsd_program.pg_vers[i])
+			found_one = 1;
+	}
+
+	if (!found_one) {
+		for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++)
+			nfsd_program.pg_vers[i] = nfsd_version[i];
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+		for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++)
+			nfsd_acl_program.pg_vers[i] =
+				nfsd_acl_version[i];
+#endif
+	}
+}
+
+int nfsd_create_serv(void)
+{
+	int err = 0;
+	lock_kernel();
+	if (nfsd_serv) {
+		svc_get(nfsd_serv);
+		unlock_kernel();
+		return 0;
+	}
+	if (nfsd_max_blksize == 0) {
+		/* choose a suitable default */
+		struct sysinfo i;
+		si_meminfo(&i);
+		/* Aim for 1/4096 of memory per thread
+		 * This gives 1MB on 4Gig machines
+		 * But only uses 32K on 128M machines.
+		 * Bottom out at 8K on 32M and smaller.
+		 * Of course, this is only a default.
+		 */
+		nfsd_max_blksize = NFSSVC_MAXBLKSIZE;
+		i.totalram <<= PAGE_SHIFT - 12;
+		while (nfsd_max_blksize > i.totalram &&
+		       nfsd_max_blksize >= 8*1024*2)
+			nfsd_max_blksize /= 2;
+	}
+
+	atomic_set(&nfsd_busy, 0);
+	nfsd_serv = svc_create_pooled(&nfsd_program,
+				      nfsd_max_blksize,
+				      nfsd_last_thread,
+				      nfsd, SIG_NOCLEAN, THIS_MODULE);
+	if (nfsd_serv == NULL)
+		err = -ENOMEM;
+	unlock_kernel();
+	do_gettimeofday(&nfssvc_boot);		/* record boot time */
+	return err;
+}
+
+static int nfsd_init_socks(int port)
+{
+	int error;
+	if (!list_empty(&nfsd_serv->sv_permsocks))
+		return 0;
+
+	error = lockd_up(IPPROTO_UDP);
+	if (error >= 0) {
+		error = svc_makesock(nfsd_serv, IPPROTO_UDP, port);
+		if (error < 0)
+			lockd_down();
+	}
+	if (error < 0)
+		return error;
+
+#ifdef CONFIG_NFSD_TCP
+	error = lockd_up(IPPROTO_TCP);
+	if (error >= 0) {
+		error = svc_makesock(nfsd_serv, IPPROTO_TCP, port);
+		if (error < 0)
+			lockd_down();
+	}
+	if (error < 0)
+		return error;
+#endif
+	return 0;
+}
+
+int nfsd_nrpools(void)
+{
+	if (nfsd_serv == NULL)
+		return 0;
+	else
+		return nfsd_serv->sv_nrpools;
+}
+
+int nfsd_get_nrthreads(int n, int *nthreads)
+{
+	int i = 0;
+
+	if (nfsd_serv != NULL) {
+		for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++)
+			nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads;
+	}
+
+	return 0;
+}
+
+int nfsd_set_nrthreads(int n, int *nthreads)
+{
+	int i = 0;
+	int tot = 0;
+	int err = 0;
+
+	if (nfsd_serv == NULL || n <= 0)
+		return 0;
+
+	if (n > nfsd_serv->sv_nrpools)
+		n = nfsd_serv->sv_nrpools;
+
+	/* enforce a global maximum number of threads */
+	tot = 0;
+	for (i = 0; i < n; i++) {
+		if (nthreads[i] > NFSD_MAXSERVS)
+			nthreads[i] = NFSD_MAXSERVS;
+		tot += nthreads[i];
+	}
+	if (tot > NFSD_MAXSERVS) {
+		/* total too large: scale down requested numbers */
+		for (i = 0; i < n && tot > 0; i++) {
+		    	int new = nthreads[i] * NFSD_MAXSERVS / tot;
+			tot -= (nthreads[i] - new);
+			nthreads[i] = new;
+		}
+		for (i = 0; i < n && tot > 0; i++) {
+			nthreads[i]--;
+			tot--;
+		}
+	}
+
+	/*
+	 * There must always be a thread in pool 0; the admin
+	 * can't shut down NFS completely using pool_threads.
+	 */
+	if (nthreads[0] == 0)
+		nthreads[0] = 1;
+
+	/* apply the new numbers */
+	lock_kernel();
+	svc_get(nfsd_serv);
+	for (i = 0; i < n; i++) {
+		err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i],
+				    	  nthreads[i]);
+		if (err)
+			break;
+	}
+	svc_destroy(nfsd_serv);
+	unlock_kernel();
+
+	return err;
+}
+
 int
 nfsd_svc(unsigned short port, int nrservs)
 {
 	int	error;
-	int	none_left, found_one, i;
-	struct list_head *victim;
 	
 	lock_kernel();
-	dprintk("nfsd: creating service: vers 0x%x\n",
-		nfsd_versbits);
+	dprintk("nfsd: creating service\n");
 	error = -EINVAL;
 	if (nrservs <= 0)
 		nrservs = 0;
@@ -153,91 +349,20 @@ nfsd_svc(unsigned short port, int nrservs)
 	error = nfs4_state_start();
 	if (error<0)
 		goto out;
-	if (!nfsd_serv) {
-		/*
-		 * Use the nfsd_ctlbits to define which
-		 * versions that will be advertised.
-		 * If nfsd_ctlbits doesn't list any version,
-		 * export them all.
-		 */
-		found_one = 0;
-
-		for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) {
-			if (NFSCTL_VERISSET(nfsd_versbits, i)) {
-				nfsd_program.pg_vers[i] = nfsd_version[i];
-				found_one = 1;
-			} else
-				nfsd_program.pg_vers[i] = NULL;
-		}
 
-		if (!found_one) {
-			for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++)
-				nfsd_program.pg_vers[i] = nfsd_version[i];
-		}
+	nfsd_reset_versions();
 
+	error = nfsd_create_serv();
 
-#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-		found_one = 0;
-
-		for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) {
-			if (NFSCTL_VERISSET(nfsd_versbits, i)) {
-				nfsd_acl_program.pg_vers[i] =
-					nfsd_acl_version[i];
-				found_one = 1;
-			} else
-				nfsd_acl_program.pg_vers[i] = NULL;
-		}
-
-		if (!found_one) {
-			for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++)
-				nfsd_acl_program.pg_vers[i] =
-					nfsd_acl_version[i];
-		}
-#endif
-
-		atomic_set(&nfsd_busy, 0);
-		error = -ENOMEM;
-		nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE);
-		if (nfsd_serv == NULL)
-			goto out;
-		error = svc_makesock(nfsd_serv, IPPROTO_UDP, port);
-		if (error < 0)
-			goto failure;
+	if (error)
+		goto out;
+	error = nfsd_init_socks(port);
+	if (error)
+		goto failure;
 
-#ifdef CONFIG_NFSD_TCP
-		error = svc_makesock(nfsd_serv, IPPROTO_TCP, port);
-		if (error < 0)
-			goto failure;
-#endif
-		do_gettimeofday(&nfssvc_boot);		/* record boot time */
-	} else
-		nfsd_serv->sv_nrthreads++;
-	nrservs -= (nfsd_serv->sv_nrthreads-1);
-	while (nrservs > 0) {
-		nrservs--;
-		__module_get(THIS_MODULE);
-		error = svc_create_thread(nfsd, nfsd_serv);
-		if (error < 0) {
-			module_put(THIS_MODULE);
-			break;
-		}
-	}
-	victim = nfsd_list.next;
-	while (nrservs < 0 && victim != &nfsd_list) {
-		struct nfsd_list *nl =
-			list_entry(victim,struct nfsd_list, list);
-		victim = victim->next;
-		send_sig(SIG_NOCLEAN, nl->task, 1);
-		nrservs++;
-	}
+	error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
  failure:
-	none_left = (nfsd_serv->sv_nrthreads == 1);
 	svc_destroy(nfsd_serv);		/* Release server */
-	if (none_left) {
-		nfsd_serv = NULL;
-		nfsd_racache_shutdown();
-		nfs4_state_shutdown();
-	}
  out:
 	unlock_kernel();
 	return error;
@@ -270,10 +395,8 @@ update_thread_usage(int busy_threads)
 static void
 nfsd(struct svc_rqst *rqstp)
 {
-	struct svc_serv	*serv = rqstp->rq_server;
 	struct fs_struct *fsp;
 	int		err;
-	struct nfsd_list me;
 	sigset_t shutdown_mask, allowed_mask;
 
 	/* Lock module and set up kernel thread */
@@ -297,10 +420,7 @@ nfsd(struct svc_rqst *rqstp)
 
 	nfsdstats.th_cnt++;
 
-	lockd_up();				/* start lockd */
-
-	me.task = current;
-	list_add(&me.list, &nfsd_list);
+	rqstp->rq_task = current;
 
 	unlock_kernel();
 
@@ -322,8 +442,7 @@ nfsd(struct svc_rqst *rqstp)
 		 * Find a socket with data available and call its
 		 * recvfrom routine.
 		 */
-		while ((err = svc_recv(serv, rqstp,
-				       60*60*HZ)) == -EAGAIN)
+		while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
 			;
 		if (err < 0)
 			break;
@@ -336,7 +455,7 @@ nfsd(struct svc_rqst *rqstp)
 		/* Process request with signals blocked.  */
 		sigprocmask(SIG_SETMASK, &allowed_mask, NULL);
 
-		svc_process(serv, rqstp);
+		svc_process(rqstp);
 
 		/* Unlock export hash tables */
 		exp_readunlock();
@@ -353,29 +472,13 @@ nfsd(struct svc_rqst *rqstp)
 			if (sigismember(&current->pending.signal, signo) &&
 			    !sigismember(&current->blocked, signo))
 				break;
-		err = signo;
+		killsig = signo;
 	}
-	/* Clear signals before calling lockd_down() and svc_exit_thread() */
+	/* Clear signals before calling svc_exit_thread() */
 	flush_signals(current);
 
 	lock_kernel();
 
-	/* Release lockd */
-	lockd_down();
-
-	/* Check if this is last thread */
-	if (serv->sv_nrthreads==1) {
-		
-		printk(KERN_WARNING "nfsd: last server has exited\n");
-		if (err != SIG_NOCLEAN) {
-			printk(KERN_WARNING "nfsd: unexporting all filesystems\n");
-			nfsd_export_flush();
-		}
-		nfsd_serv = NULL;
-	        nfsd_racache_shutdown();	/* release read-ahead cache */
-		nfs4_state_shutdown();
-	}
-	list_del(&me.list);
 	nfsdstats.th_cnt --;
 
 out:
@@ -388,12 +491,12 @@ out:
 }
 
 int
-nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
+nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 {
 	struct svc_procedure	*proc;
 	kxdrproc_t		xdr;
-	u32			nfserr;
-	u32			*nfserrp;
+	__be32			nfserr;
+	__be32			*nfserrp;
 
 	dprintk("nfsd_dispatch: vers %d proc %d\n",
 				rqstp->rq_vers, rqstp->rq_proc);
@@ -412,7 +515,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
 
 	/* Decode arguments */
 	xdr = proc->pc_decode;
-	if (xdr && !xdr(rqstp, (u32*)rqstp->rq_arg.head[0].iov_base,
+	if (xdr && !xdr(rqstp, (__be32*)rqstp->rq_arg.head[0].iov_base,
 			rqstp->rq_argp)) {
 		dprintk("nfsd: failed to decode arguments!\n");
 		nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
@@ -425,7 +528,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
 	 */
 	nfserrp = rqstp->rq_res.head[0].iov_base
 		+ rqstp->rq_res.head[0].iov_len;
-	rqstp->rq_res.head[0].iov_len += sizeof(u32);
+	rqstp->rq_res.head[0].iov_len += sizeof(__be32);
 
 	/* Now call the procedure handler, and encode NFS status. */
 	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index e3a0797dd56b..56ebb1443e0e 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nfsd/xdr.c
+ * linux/fs/nfsd/nfsxdr.c
  *
  * XDR support for nfsd
  *
@@ -37,8 +37,8 @@ static u32	nfs_ftypes[] = {
 /*
  * XDR functions for basic NFS types
  */
-static u32 *
-decode_fh(u32 *p, struct svc_fh *fhp)
+static __be32 *
+decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	fh_init(fhp, NFS_FHSIZE);
 	memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
@@ -50,13 +50,13 @@ decode_fh(u32 *p, struct svc_fh *fhp)
 }
 
 /* Helper function for NFSv2 ACL code */
-u32 *nfs2svc_decode_fh(u32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	return decode_fh(p, fhp);
 }
 
-static inline u32 *
-encode_fh(u32 *p, struct svc_fh *fhp)
+static inline __be32 *
+encode_fh(__be32 *p, struct svc_fh *fhp)
 {
 	memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
 	return p + (NFS_FHSIZE>> 2);
@@ -66,8 +66,8 @@ encode_fh(u32 *p, struct svc_fh *fhp)
  * Decode a file name and make sure that the path contains
  * no slashes or null bytes.
  */
-static inline u32 *
-decode_filename(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_filename(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
 	int		i;
@@ -82,8 +82,8 @@ decode_filename(u32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline u32 *
-decode_pathname(u32 *p, char **namp, int *lenp)
+static inline __be32 *
+decode_pathname(__be32 *p, char **namp, int *lenp)
 {
 	char		*name;
 	int		i;
@@ -98,8 +98,8 @@ decode_pathname(u32 *p, char **namp, int *lenp)
 	return p;
 }
 
-static inline u32 *
-decode_sattr(u32 *p, struct iattr *iap)
+static inline __be32 *
+decode_sattr(__be32 *p, struct iattr *iap)
 {
 	u32	tmp, tmp1;
 
@@ -151,8 +151,8 @@ decode_sattr(u32 *p, struct iattr *iap)
 	return p;
 }
 
-static u32 *
-encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
+static __be32 *
+encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
 	     struct kstat *stat)
 {
 	struct dentry	*dentry = fhp->fh_dentry;
@@ -195,7 +195,7 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp,
 }
 
 /* Helper function for NFSv2 ACL code */
-u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
 {
 	struct kstat stat;
 	vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat);
@@ -206,13 +206,13 @@ u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
  * XDR decode functions
  */
 int
-nfssvc_decode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfssvc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
+nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
@@ -220,7 +220,7 @@ nfssvc_decode_fhandle(struct svc_rqst *rqstp, u32 *p, struct nfsd_fhandle *args)
 }
 
 int
-nfssvc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_sattrargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -231,7 +231,7 @@ nfssvc_decode_sattrargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_diropargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -242,7 +242,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readargs *args)
 {
 	unsigned int len;
@@ -254,19 +254,18 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 	len = args->count     = ntohl(*p++);
 	p++; /* totalcount - unused */
 
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > NFSSVC_MAXBLKSIZE_V2)
+		len = NFSSVC_MAXBLKSIZE_V2;
 
 	/* set up somewhere to store response.
 	 * We take pages, put them on reslist and include in iovec
 	 */
 	v=0;
 	while (len > 0) {
-		pn=rqstp->rq_resused;
-		svc_take_page(rqstp);
-		args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
-		args->vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
-		len -= args->vec[v].iov_len;
+		pn = rqstp->rq_resused++;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+		rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
 	}
 	args->vlen = v;
@@ -274,7 +273,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_writeargs *args)
 {
 	unsigned int len;
@@ -286,25 +285,25 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p,
 	args->offset = ntohl(*p++);	/* offset */
 	p++;				/* totalcount */
 	len = args->len = ntohl(*p++);
-	args->vec[0].iov_base = (void*)p;
-	args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
+	rqstp->rq_vec[0].iov_base = (void*)p;
+	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len -
 				(((void*)p) - rqstp->rq_arg.head[0].iov_base);
-	if (len > NFSSVC_MAXBLKSIZE)
-		len = NFSSVC_MAXBLKSIZE;
+	if (len > NFSSVC_MAXBLKSIZE_V2)
+		len = NFSSVC_MAXBLKSIZE_V2;
 	v = 0;
-	while (len > args->vec[v].iov_len) {
-		len -= args->vec[v].iov_len;
+	while (len > rqstp->rq_vec[v].iov_len) {
+		len -= rqstp->rq_vec[v].iov_len;
 		v++;
-		args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]);
-		args->vec[v].iov_len = PAGE_SIZE;
+		rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]);
+		rqstp->rq_vec[v].iov_len = PAGE_SIZE;
 	}
-	args->vec[v].iov_len = len;
+	rqstp->rq_vec[v].iov_len = len;
 	args->vlen = v+1;
-	return args->vec[0].iov_len > 0;
+	return rqstp->rq_vec[0].iov_len > 0;
 }
 
 int
-nfssvc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_createargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh))
@@ -316,7 +315,7 @@ nfssvc_decode_createargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_renameargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -329,18 +328,17 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p, struct nfsd_readlinkargs *args)
+nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
 		return 0;
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
 
 int
-nfssvc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_linkargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -352,7 +350,7 @@ nfssvc_decode_linkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_symlinkargs *args)
 {
 	if (!(p = decode_fh(p, &args->ffh))
@@ -365,7 +363,7 @@ nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
+nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readdirargs *args)
 {
 	if (!(p = decode_fh(p, &args->fh)))
@@ -375,8 +373,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
 	if (args->count > PAGE_SIZE)
 		args->count = PAGE_SIZE;
 
-	svc_take_page(rqstp);
-	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]);
+	args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
 
 	return xdr_argsize_check(rqstp, p);
 }
@@ -385,13 +382,13 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p,
  * XDR encode functions
  */
 int
-nfssvc_encode_void(struct svc_rqst *rqstp, u32 *p, void *dummy)
+nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 {
 	return xdr_ressize_check(rqstp, p);
 }
 
 int
-nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_attrstat *resp)
 {
 	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
@@ -399,7 +396,7 @@ nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_diropres *resp)
 {
 	p = encode_fh(p, &resp->fh);
@@ -408,7 +405,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readlinkres *resp)
 {
 	*p++ = htonl(resp->len);
@@ -416,7 +413,6 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->len;
 	if (resp->len & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
@@ -425,7 +421,7 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readres *resp)
 {
 	p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
@@ -436,7 +432,6 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 	rqstp->rq_res.page_len = resp->count;
 	if (resp->count & 3) {
 		/* need to pad the tail */
-		rqstp->rq_restailpage = 0;
 		rqstp->rq_res.tail[0].iov_base = p;
 		*p = 0;
 		rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3);
@@ -445,7 +440,7 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_readdirres *resp)
 {
 	xdr_ressize_check(rqstp, p);
@@ -458,12 +453,12 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, u32 *p,
 }
 
 int
-nfssvc_encode_statfsres(struct svc_rqst *rqstp, u32 *p,
+nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_statfsres *resp)
 {
 	struct kstatfs	*stat = &resp->stats;
 
-	*p++ = htonl(NFSSVC_MAXBLKSIZE);	/* max transfer size */
+	*p++ = htonl(NFSSVC_MAXBLKSIZE_V2);	/* max transfer size */
 	*p++ = htonl(stat->f_bsize);
 	*p++ = htonl(stat->f_blocks);
 	*p++ = htonl(stat->f_bfree);
@@ -476,7 +471,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
 		    int namlen, loff_t offset, ino_t ino, unsigned int d_type)
 {
 	struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common);
-	u32	*p = cd->buffer;
+	__be32	*p = cd->buffer;
 	int	buflen, slen;
 
 	/*
@@ -502,7 +497,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
 	*p++ = htonl((u32) ino);		/* file id */
 	p    = xdr_encode_array(p, name, namlen);/* name length & name */
 	cd->offset = p;			/* remember pointer */
-	*p++ = ~(u32) 0;		/* offset of next entry */
+	*p++ = htonl(~0U);		/* offset of next entry */
 
 	cd->buflen = buflen;
 	cd->buffer = p;
@@ -514,7 +509,7 @@ nfssvc_encode_entry(struct readdir_cd *ccd, const char *name,
  * XDR release functions
  */
 int
-nfssvc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+nfssvc_release_fhandle(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_fhandle *resp)
 {
 	fh_put(&resp->fh);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c9e3b5a8fe07..f21e917bb8ed 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -54,6 +54,7 @@
 #include <linux/nfsd_idmap.h>
 #include <linux/security.h>
 #endif /* CONFIG_NFSD_V4 */
+#include <linux/jhash.h>
 
 #include <asm/uaccess.h>
 
@@ -81,10 +82,19 @@ struct raparms {
 	dev_t			p_dev;
 	int			p_set;
 	struct file_ra_state	p_ra;
+	unsigned int		p_hindex;
 };
 
+struct raparm_hbucket {
+	struct raparms		*pb_head;
+	spinlock_t		pb_lock;
+} ____cacheline_aligned_in_smp;
+
 static struct raparms *		raparml;
-static struct raparms *		raparm_cache;
+#define RAPARM_HASH_BITS	4
+#define RAPARM_HASH_SIZE	(1<<RAPARM_HASH_BITS)
+#define RAPARM_HASH_MASK	(RAPARM_HASH_SIZE-1)
+static struct raparm_hbucket	raparm_hash[RAPARM_HASH_SIZE];
 
 /* 
  * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
@@ -100,7 +110,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 	struct dentry *dentry = *dpp;
 	struct vfsmount *mnt = mntget(exp->ex_mnt);
 	struct dentry *mounts = dget(dentry);
-	int err = nfs_ok;
+	int err = 0;
 
 	while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
 
@@ -138,14 +148,15 @@ out:
  *   clients and is explicitly disallowed for NFSv3
  *      NeilBrown <neilb@cse.unsw.edu.au>
  */
-int
+__be32
 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 					int len, struct svc_fh *resfh)
 {
 	struct svc_export	*exp;
 	struct dentry		*dparent;
 	struct dentry		*dentry;
-	int			err;
+	__be32			err;
+	int			host_err;
 
 	dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
 
@@ -183,7 +194,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 			exp2 = exp_parent(exp->ex_client, mnt, dentry,
 					  &rqstp->rq_chandle);
 			if (IS_ERR(exp2)) {
-				err = PTR_ERR(exp2);
+				host_err = PTR_ERR(exp2);
 				dput(dentry);
 				mntput(mnt);
 				goto out_nfserr;
@@ -200,14 +211,14 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 	} else {
 		fh_lock(fhp);
 		dentry = lookup_one_len(name, dparent, len);
-		err = PTR_ERR(dentry);
+		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
 		/*
 		 * check if we have crossed a mount point ...
 		 */
 		if (d_mountpoint(dentry)) {
-			if ((err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
+			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
 			}
@@ -226,7 +237,7 @@ out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 
@@ -234,7 +245,7 @@ out_nfserr:
  * Set various file attributes.
  * N.B. After this call fhp needs an fh_put
  */
-int
+__be32
 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	     int check_guard, time_t guardtime)
 {
@@ -243,7 +254,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	int		accmode = MAY_SATTR;
 	int		ftype = 0;
 	int		imode;
-	int		err;
+	__be32		err;
+	int		host_err;
 	int		size_change = 0;
 
 	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
@@ -309,19 +321,19 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		 * If we are changing the size of the file, then
 		 * we need to break all leases.
 		 */
-		err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
-		if (err == -EWOULDBLOCK)
-			err = -ETIMEDOUT;
-		if (err) /* ENOMEM or EWOULDBLOCK */
+		host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
+		if (host_err == -EWOULDBLOCK)
+			host_err = -ETIMEDOUT;
+		if (host_err) /* ENOMEM or EWOULDBLOCK */
 			goto out_nfserr;
 
-		err = get_write_access(inode);
-		if (err)
+		host_err = get_write_access(inode);
+		if (host_err)
 			goto out_nfserr;
 
 		size_change = 1;
-		err = locks_verify_truncate(inode, NULL, iap->ia_size);
-		if (err) {
+		host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+		if (host_err) {
 			put_write_access(inode);
 			goto out_nfserr;
 		}
@@ -347,8 +359,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	err = nfserr_notsync;
 	if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
 		fh_lock(fhp);
-		err = notify_change(dentry, iap);
-		err = nfserrno(err);
+		host_err = notify_change(dentry, iap);
+		err = nfserrno(host_err);
 		fh_unlock(fhp);
 	}
 	if (size_change)
@@ -360,7 +372,7 @@ out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 
@@ -410,11 +422,12 @@ out:
 	return error;
 }
 
-int
+__be32
 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
     struct nfs4_acl *acl)
 {
-	int error;
+	__be32 error;
+	int host_error;
 	struct dentry *dentry;
 	struct inode *inode;
 	struct posix_acl *pacl = NULL, *dpacl = NULL;
@@ -430,22 +443,20 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (S_ISDIR(inode->i_mode))
 		flags = NFS4_ACL_DIR;
 
-	error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
-	if (error == -EINVAL) {
+	host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+	if (host_error == -EINVAL) {
 		error = nfserr_attrnotsupp;
 		goto out;
-	} else if (error < 0)
+	} else if (host_error < 0)
 		goto out_nfserr;
 
-	if (pacl) {
-		error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
-		if (error < 0)
-			goto out_nfserr;
-	}
+	host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
+	if (host_error < 0)
+		goto out_nfserr;
 
-	if (dpacl) {
-		error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
-		if (error < 0)
+	if (S_ISDIR(inode->i_mode)) {
+		host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
+		if (host_error < 0)
 			goto out_nfserr;
 	}
 
@@ -456,7 +467,7 @@ out:
 	posix_acl_release(dpacl);
 	return (error);
 out_nfserr:
-	error = nfserrno(error);
+	error = nfserrno(host_error);
 	goto out;
 }
 
@@ -563,14 +574,14 @@ static struct accessmap	nfs3_anyaccess[] = {
     {	0,			0				}
 };
 
-int
+__be32
 nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
 {
 	struct accessmap	*map;
 	struct svc_export	*export;
 	struct dentry		*dentry;
 	u32			query, result = 0, sresult = 0;
-	unsigned int		error;
+	__be32			error;
 
 	error = fh_verify(rqstp, fhp, 0, MAY_NOP);
 	if (error)
@@ -590,7 +601,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
 	query = *access;
 	for  (; map->access; map++) {
 		if (map->access & query) {
-			unsigned int err2;
+			__be32 err2;
 
 			sresult |= map->access;
 
@@ -629,13 +640,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
  * The access argument indicates the type of open (read/write/lock)
  * N.B. After this call fhp needs an fh_put
  */
-int
+__be32
 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 			int access, struct file **filp)
 {
 	struct dentry	*dentry;
 	struct inode	*inode;
-	int		flags = O_RDONLY|O_LARGEFILE, err;
+	int		flags = O_RDONLY|O_LARGEFILE;
+	__be32		err;
+	int		host_err;
 
 	/*
 	 * If we get here, then the client has already done an "open",
@@ -665,10 +678,10 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	 * Check to see if there are any leases on this file.
 	 * This may block while leases are broken.
 	 */
-	err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
-	if (err == -EWOULDBLOCK)
-		err = -ETIMEDOUT;
-	if (err) /* NOMEM or WOULDBLOCK */
+	host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
+	if (host_err == -EWOULDBLOCK)
+		host_err = -ETIMEDOUT;
+	if (host_err) /* NOMEM or WOULDBLOCK */
 		goto out_nfserr;
 
 	if (access & MAY_WRITE) {
@@ -681,10 +694,9 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	}
 	*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags);
 	if (IS_ERR(*filp))
-		err = PTR_ERR(*filp);
+		host_err = PTR_ERR(*filp);
 out_nfserr:
-	if (err)
-		err = nfserrno(err);
+	err = nfserrno(host_err);
 out:
 	return err;
 }
@@ -743,16 +755,20 @@ nfsd_sync_dir(struct dentry *dp)
  * Obtain the readahead parameters for the file
  * specified by (dev, ino).
  */
-static DEFINE_SPINLOCK(ra_lock);
 
 static inline struct raparms *
 nfsd_get_raparms(dev_t dev, ino_t ino)
 {
 	struct raparms	*ra, **rap, **frap = NULL;
 	int depth = 0;
+	unsigned int hash;
+	struct raparm_hbucket *rab;
+
+	hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
+	rab = &raparm_hash[hash];
 
-	spin_lock(&ra_lock);
-	for (rap = &raparm_cache; (ra = *rap); rap = &ra->p_next) {
+	spin_lock(&rab->pb_lock);
+	for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
 		if (ra->p_ino == ino && ra->p_dev == dev)
 			goto found;
 		depth++;
@@ -761,7 +777,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
 	}
 	depth = nfsdstats.ra_size*11/10;
 	if (!frap) {	
-		spin_unlock(&ra_lock);
+		spin_unlock(&rab->pb_lock);
 		return NULL;
 	}
 	rap = frap;
@@ -769,15 +785,16 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
 	ra->p_dev = dev;
 	ra->p_ino = ino;
 	ra->p_set = 0;
+	ra->p_hindex = hash;
 found:
-	if (rap != &raparm_cache) {
+	if (rap != &rab->pb_head) {
 		*rap = ra->p_next;
-		ra->p_next   = raparm_cache;
-		raparm_cache = ra;
+		ra->p_next   = rab->pb_head;
+		rab->pb_head = ra;
 	}
 	ra->p_count++;
 	nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
-	spin_unlock(&ra_lock);
+	spin_unlock(&rab->pb_lock);
 	return ra;
 }
 
@@ -791,36 +808,41 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset
 {
 	unsigned long count = desc->count;
 	struct svc_rqst *rqstp = desc->arg.data;
+	struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
 
 	if (size > count)
 		size = count;
 
 	if (rqstp->rq_res.page_len == 0) {
 		get_page(page);
-		rqstp->rq_respages[rqstp->rq_resused++] = page;
+		put_page(*pp);
+		*pp = page;
+		rqstp->rq_resused++;
 		rqstp->rq_res.page_base = offset;
 		rqstp->rq_res.page_len = size;
-	} else if (page != rqstp->rq_respages[rqstp->rq_resused-1]) {
+	} else if (page != pp[-1]) {
 		get_page(page);
-		rqstp->rq_respages[rqstp->rq_resused++] = page;
+		put_page(*pp);
+		*pp = page;
+		rqstp->rq_resused++;
 		rqstp->rq_res.page_len += size;
-	} else {
+	} else
 		rqstp->rq_res.page_len += size;
-	}
 
 	desc->count = count - size;
 	desc->written += size;
 	return size;
 }
 
-static int
+static __be32
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
               loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
 	struct inode *inode;
 	struct raparms	*ra;
 	mm_segment_t	oldfs;
-	int		err;
+	__be32		err;
+	int		host_err;
 
 	err = nfserr_perm;
 	inode = file->f_dentry->d_inode;
@@ -837,32 +859,33 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		file->f_ra = ra->p_ra;
 
 	if (file->f_op->sendfile && rqstp->rq_sendfile_ok) {
-		svc_pushback_unused_pages(rqstp);
-		err = file->f_op->sendfile(file, &offset, *count,
+		rqstp->rq_resused = 1;
+		host_err = file->f_op->sendfile(file, &offset, *count,
 						 nfsd_read_actor, rqstp);
 	} else {
 		oldfs = get_fs();
 		set_fs(KERNEL_DS);
-		err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+		host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
 		set_fs(oldfs);
 	}
 
 	/* Write back readahead params */
 	if (ra) {
-		spin_lock(&ra_lock);
+		struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
+		spin_lock(&rab->pb_lock);
 		ra->p_ra = file->f_ra;
 		ra->p_set = 1;
 		ra->p_count--;
-		spin_unlock(&ra_lock);
+		spin_unlock(&rab->pb_lock);
 	}
 
-	if (err >= 0) {
-		nfsdstats.io_read += err;
-		*count = err;
+	if (host_err >= 0) {
+		nfsdstats.io_read += host_err;
+		*count = host_err;
 		err = 0;
 		fsnotify_access(file->f_dentry);
 	} else 
-		err = nfserrno(err);
+		err = nfserrno(host_err);
 out:
 	return err;
 }
@@ -877,7 +900,7 @@ static void kill_suid(struct dentry *dentry)
 	mutex_unlock(&dentry->d_inode->i_mutex);
 }
 
-static int
+static __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
 	   			unsigned long cnt, int *stablep)
@@ -886,7 +909,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	struct dentry		*dentry;
 	struct inode		*inode;
 	mm_segment_t		oldfs;
-	int			err = 0;
+	__be32			err = 0;
+	int			host_err;
 	int			stable = *stablep;
 
 #ifdef MSNFS
@@ -922,18 +946,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 	/* Write the data. */
 	oldfs = get_fs(); set_fs(KERNEL_DS);
-	err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
+	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
 	set_fs(oldfs);
-	if (err >= 0) {
+	if (host_err >= 0) {
 		nfsdstats.io_write += cnt;
 		fsnotify_modify(file->f_dentry);
 	}
 
 	/* clear setuid/setgid flag after write */
-	if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
+	if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
 		kill_suid(dentry);
 
-	if (err >= 0 && stable) {
+	if (host_err >= 0 && stable) {
 		static ino_t	last_ino;
 		static dev_t	last_dev;
 
@@ -959,7 +983,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 			if (inode->i_state & I_DIRTY) {
 				dprintk("nfsd: write sync %d\n", current->pid);
-				err=nfsd_sync(file);
+				host_err=nfsd_sync(file);
 			}
 #if 0
 			wake_up(&inode->i_wait);
@@ -969,11 +993,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		last_dev = inode->i_sb->s_dev;
 	}
 
-	dprintk("nfsd: write complete err=%d\n", err);
-	if (err >= 0)
+	dprintk("nfsd: write complete host_err=%d\n", host_err);
+	if (host_err >= 0)
 		err = 0;
 	else 
-		err = nfserrno(err);
+		err = nfserrno(host_err);
 out:
 	return err;
 }
@@ -983,12 +1007,12 @@ out:
  * on entry. On return, *count contains the number of bytes actually read.
  * N.B. After this call fhp needs an fh_put
  */
-int
+__be32
 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		loff_t offset, struct kvec *vec, int vlen,
 		unsigned long *count)
 {
-	int		err;
+	__be32		err;
 
 	if (file) {
 		err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
@@ -1012,12 +1036,12 @@ out:
  * The stable flag requests synchronous writes.
  * N.B. After this call fhp needs an fh_put
  */
-int
+__be32
 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
 		int *stablep)
 {
-	int			err = 0;
+	__be32			err = 0;
 
 	if (file) {
 		err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
@@ -1049,12 +1073,12 @@ out:
  * Unfortunately we cannot lock the file to make sure we return full WCC
  * data to the client, as locking happens lower down in the filesystem.
  */
-int
+__be32
 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
                loff_t offset, unsigned long count)
 {
 	struct file	*file;
-	int		err;
+	__be32		err;
 
 	if ((u64)count > ~(u64)offset)
 		return nfserr_inval;
@@ -1082,14 +1106,15 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
  *
  * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
  */
-int
+__be32
 nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		char *fname, int flen, struct iattr *iap,
 		int type, dev_t rdev, struct svc_fh *resfhp)
 {
 	struct dentry	*dentry, *dchild = NULL;
 	struct inode	*dirp;
-	int		err;
+	__be32		err;
+	int		host_err;
 
 	err = nfserr_perm;
 	if (!flen)
@@ -1114,9 +1139,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 */
 	if (!resfhp->fh_dentry) {
 		/* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
-		fh_lock(fhp);
+		fh_lock_nested(fhp, I_MUTEX_PARENT);
 		dchild = lookup_one_len(fname, dentry, flen);
-		err = PTR_ERR(dchild);
+		host_err = PTR_ERR(dchild);
 		if (IS_ERR(dchild))
 			goto out_nfserr;
 		err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
@@ -1155,22 +1180,22 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = nfserr_perm;
 	switch (type) {
 	case S_IFREG:
-		err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
+		host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
 		break;
 	case S_IFDIR:
-		err = vfs_mkdir(dirp, dchild, iap->ia_mode);
+		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
 		break;
 	case S_IFCHR:
 	case S_IFBLK:
 	case S_IFIFO:
 	case S_IFSOCK:
-		err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
+		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
 		break;
 	default:
 	        printk("nfsd: bad file type %o in nfsd_create\n", type);
-		err = -EINVAL;
+		host_err = -EINVAL;
 	}
-	if (err < 0)
+	if (host_err < 0)
 		goto out_nfserr;
 
 	if (EX_ISSYNC(fhp->fh_export)) {
@@ -1185,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 * directories via NFS.
 	 */
 	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
-		int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+		__be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
 		if (err2)
 			err = err2;
 	}
@@ -1200,7 +1225,7 @@ out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 
@@ -1208,7 +1233,7 @@ out_nfserr:
 /*
  * NFSv3 version of nfsd_create
  */
-int
+__be32
 nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		char *fname, int flen, struct iattr *iap,
 		struct svc_fh *resfhp, int createmode, u32 *verifier,
@@ -1216,7 +1241,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 {
 	struct dentry	*dentry, *dchild = NULL;
 	struct inode	*dirp;
-	int		err;
+	__be32		err;
+	int		host_err;
 	__u32		v_mtime=0, v_atime=0;
 	int		v_mode=0;
 
@@ -1240,13 +1266,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = nfserr_notdir;
 	if(!dirp->i_op || !dirp->i_op->lookup)
 		goto out;
-	fh_lock(fhp);
+	fh_lock_nested(fhp, I_MUTEX_PARENT);
 
 	/*
 	 * Compose the response file handle.
 	 */
 	dchild = lookup_one_len(fname, dentry, flen);
-	err = PTR_ERR(dchild);
+	host_err = PTR_ERR(dchild);
 	if (IS_ERR(dchild))
 		goto out_nfserr;
 
@@ -1302,8 +1328,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out;
 	}
 
-	err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
-	if (err < 0)
+	host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
+	if (host_err < 0)
 		goto out_nfserr;
 
 	if (EX_ISSYNC(fhp->fh_export)) {
@@ -1332,7 +1358,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 */
  set_attr:
 	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) {
- 		int err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
+ 		__be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
 		if (err2)
 			err = err2;
 	}
@@ -1350,7 +1376,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
  	return err;
  
  out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 #endif /* CONFIG_NFSD_V3 */
@@ -1360,13 +1386,14 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * fits into the buffer. On return, it contains the true length.
  * N.B. After this call fhp needs an fh_put
  */
-int
+__be32
 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
 {
 	struct dentry	*dentry;
 	struct inode	*inode;
 	mm_segment_t	oldfs;
-	int		err;
+	__be32		err;
+	int		host_err;
 
 	err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
 	if (err)
@@ -1385,18 +1412,18 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
 	 */
 
 	oldfs = get_fs(); set_fs(KERNEL_DS);
-	err = inode->i_op->readlink(dentry, buf, *lenp);
+	host_err = inode->i_op->readlink(dentry, buf, *lenp);
 	set_fs(oldfs);
 
-	if (err < 0)
+	if (host_err < 0)
 		goto out_nfserr;
-	*lenp = err;
+	*lenp = host_err;
 	err = 0;
 out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 
@@ -1404,7 +1431,7 @@ out_nfserr:
  * Create a symlink and look up its inode
  * N.B. After this call _both_ fhp and resfhp need an fh_put
  */
-int
+__be32
 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				char *fname, int flen,
 				char *path,  int plen,
@@ -1412,7 +1439,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				struct iattr *iap)
 {
 	struct dentry	*dentry, *dnew;
-	int		err, cerr;
+	__be32		err, cerr;
+	int		host_err;
 	umode_t		mode;
 
 	err = nfserr_noent;
@@ -1428,7 +1456,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	fh_lock(fhp);
 	dentry = fhp->fh_dentry;
 	dnew = lookup_one_len(fname, dentry, flen);
-	err = PTR_ERR(dnew);
+	host_err = PTR_ERR(dnew);
 	if (IS_ERR(dnew))
 		goto out_nfserr;
 
@@ -1440,21 +1468,21 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (unlikely(path[plen] != 0)) {
 		char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
 		if (path_alloced == NULL)
-			err = -ENOMEM;
+			host_err = -ENOMEM;
 		else {
 			strncpy(path_alloced, path, plen);
 			path_alloced[plen] = 0;
-			err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
+			host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
 			kfree(path_alloced);
 		}
 	} else
-		err = vfs_symlink(dentry->d_inode, dnew, path, mode);
+		host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);
 
-	if (!err)
+	if (!host_err) {
 		if (EX_ISSYNC(fhp->fh_export))
-			err = nfsd_sync_dir(dentry);
-	if (err)
-		err = nfserrno(err);
+			host_err = nfsd_sync_dir(dentry);
+	}
+	err = nfserrno(host_err);
 	fh_unlock(fhp);
 
 	cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
@@ -1464,7 +1492,7 @@ out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out;
 }
 
@@ -1472,13 +1500,14 @@ out_nfserr:
  * Create a hardlink
  * N.B. After this call _both_ ffhp and tfhp need an fh_put
  */
-int
+__be32
 nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 				char *name, int len, struct svc_fh *tfhp)
 {
 	struct dentry	*ddir, *dnew, *dold;
 	struct inode	*dirp, *dest;
-	int		err;
+	__be32		err;
+	int		host_err;
 
 	err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
 	if (err)
@@ -1494,29 +1523,30 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	if (isdotent(name, len))
 		goto out;
 
-	fh_lock(ffhp);
+	fh_lock_nested(ffhp, I_MUTEX_PARENT);
 	ddir = ffhp->fh_dentry;
 	dirp = ddir->d_inode;
 
 	dnew = lookup_one_len(name, ddir, len);
-	err = PTR_ERR(dnew);
+	host_err = PTR_ERR(dnew);
 	if (IS_ERR(dnew))
 		goto out_nfserr;
 
 	dold = tfhp->fh_dentry;
 	dest = dold->d_inode;
 
-	err = vfs_link(dold, dirp, dnew);
-	if (!err) {
+	host_err = vfs_link(dold, dirp, dnew);
+	if (!host_err) {
 		if (EX_ISSYNC(ffhp->fh_export)) {
 			err = nfserrno(nfsd_sync_dir(ddir));
 			write_inode_now(dest, 1);
 		}
+		err = 0;
 	} else {
-		if (err == -EXDEV && rqstp->rq_vers == 2)
+		if (host_err == -EXDEV && rqstp->rq_vers == 2)
 			err = nfserr_acces;
 		else
-			err = nfserrno(err);
+			err = nfserrno(host_err);
 	}
 
 	dput(dnew);
@@ -1526,7 +1556,7 @@ out:
 	return err;
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 	goto out_unlock;
 }
 
@@ -1534,13 +1564,14 @@ out_nfserr:
  * Rename a file
  * N.B. After this call _both_ ffhp and tfhp need an fh_put
  */
-int
+__be32
 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 			    struct svc_fh *tfhp, char *tname, int tlen)
 {
 	struct dentry	*fdentry, *tdentry, *odentry, *ndentry, *trap;
 	struct inode	*fdir, *tdir;
-	int		err;
+	__be32		err;
+	int		host_err;
 
 	err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
 	if (err)
@@ -1571,22 +1602,22 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	fill_pre_wcc(tfhp);
 
 	odentry = lookup_one_len(fname, fdentry, flen);
-	err = PTR_ERR(odentry);
+	host_err = PTR_ERR(odentry);
 	if (IS_ERR(odentry))
 		goto out_nfserr;
 
-	err = -ENOENT;
+	host_err = -ENOENT;
 	if (!odentry->d_inode)
 		goto out_dput_old;
-	err = -EINVAL;
+	host_err = -EINVAL;
 	if (odentry == trap)
 		goto out_dput_old;
 
 	ndentry = lookup_one_len(tname, tdentry, tlen);
-	err = PTR_ERR(ndentry);
+	host_err = PTR_ERR(ndentry);
 	if (IS_ERR(ndentry))
 		goto out_dput_old;
-	err = -ENOTEMPTY;
+	host_err = -ENOTEMPTY;
 	if (ndentry == trap)
 		goto out_dput_new;
 
@@ -1594,14 +1625,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 	if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
 		((atomic_read(&odentry->d_count) > 1)
 		 || (atomic_read(&ndentry->d_count) > 1))) {
-			err = -EPERM;
+			host_err = -EPERM;
 	} else
 #endif
-	err = vfs_rename(fdir, odentry, tdir, ndentry);
-	if (!err && EX_ISSYNC(tfhp->fh_export)) {
-		err = nfsd_sync_dir(tdentry);
-		if (!err)
-			err = nfsd_sync_dir(fdentry);
+	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
+	if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
+		host_err = nfsd_sync_dir(tdentry);
+		if (!host_err)
+			host_err = nfsd_sync_dir(fdentry);
 	}
 
  out_dput_new:
@@ -1609,8 +1640,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
  out_dput_old:
 	dput(odentry);
  out_nfserr:
-	if (err)
-		err = nfserrno(err);
+	err = nfserrno(host_err);
 
 	/* we cannot reply on fh_unlock on the two filehandles,
 	 * as that would do the wrong thing if the two directories
@@ -1629,13 +1659,14 @@ out:
  * Unlink a file or directory
  * N.B. After this call fhp needs an fh_put
  */
-int
+__be32
 nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 				char *fname, int flen)
 {
 	struct dentry	*dentry, *rdentry;
 	struct inode	*dirp;
-	int		err;
+	__be32		err;
+	int		host_err;
 
 	err = nfserr_acces;
 	if (!flen || isdotent(fname, flen))
@@ -1644,12 +1675,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	if (err)
 		goto out;
 
-	fh_lock(fhp);
+	fh_lock_nested(fhp, I_MUTEX_PARENT);
 	dentry = fhp->fh_dentry;
 	dirp = dentry->d_inode;
 
 	rdentry = lookup_one_len(fname, dentry, flen);
-	err = PTR_ERR(rdentry);
+	host_err = PTR_ERR(rdentry);
 	if (IS_ERR(rdentry))
 		goto out_nfserr;
 
@@ -1666,22 +1697,23 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 #ifdef MSNFS
 		if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
 			(atomic_read(&rdentry->d_count) > 1)) {
-			err = -EPERM;
+			host_err = -EPERM;
 		} else
 #endif
-		err = vfs_unlink(dirp, rdentry);
+		host_err = vfs_unlink(dirp, rdentry);
 	} else { /* It's RMDIR */
-		err = vfs_rmdir(dirp, rdentry);
+		host_err = vfs_rmdir(dirp, rdentry);
 	}
 
 	dput(rdentry);
 
-	if (err == 0 &&
-	    EX_ISSYNC(fhp->fh_export))
-			err = nfsd_sync_dir(dentry);
+	if (host_err)
+		goto out_nfserr;
+	if (EX_ISSYNC(fhp->fh_export))
+		host_err = nfsd_sync_dir(dentry);
 
 out_nfserr:
-	err = nfserrno(err);
+	err = nfserrno(host_err);
 out:
 	return err;
 }
@@ -1690,11 +1722,12 @@ out:
  * Read entries from a directory.
  * The  NFSv3/4 verifier we ignore for now.
  */
-int
+__be32
 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, 
 	     struct readdir_cd *cdp, encode_dent_fn func)
 {
-	int		err;
+	__be32		err;
+	int 		host_err;
 	struct file	*file;
 	loff_t		offset = *offsetp;
 
@@ -1716,10 +1749,10 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 
 	do {
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
-		err = vfs_readdir(file, (filldir_t) func, cdp);
-	} while (err >=0 && cdp->err == nfs_ok);
-	if (err)
-		err = nfserrno(err);
+		host_err = vfs_readdir(file, (filldir_t) func, cdp);
+	} while (host_err >=0 && cdp->err == nfs_ok);
+	if (host_err)
+		err = nfserrno(host_err);
 	else
 		err = cdp->err;
 	*offsetp = vfs_llseek(file, 0, 1);
@@ -1736,10 +1769,10 @@ out:
  * Get file system stats
  * N.B. After this call fhp needs an fh_put
  */
-int
+__be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 {
-	int err = fh_verify(rqstp, fhp, 0, MAY_NOP);
+	__be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
 	if (!err && vfs_statfs(fhp->fh_dentry,stat))
 		err = nfserr_io;
 	return err;
@@ -1748,7 +1781,7 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
 /*
  * Check for a user's access permissions to this inode.
  */
-int
+__be32
 nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
 {
 	struct inode	*inode = dentry->d_inode;
@@ -1829,11 +1862,11 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
 void
 nfsd_racache_shutdown(void)
 {
-	if (!raparm_cache)
+	if (!raparml)
 		return;
 	dprintk("nfsd: freeing readahead buffers.\n");
 	kfree(raparml);
-	raparm_cache = raparml = NULL;
+	raparml = NULL;
 }
 /*
  * Initialize readahead param cache
@@ -1842,19 +1875,31 @@ int
 nfsd_racache_init(int cache_size)
 {
 	int	i;
+	int	j = 0;
+	int	nperbucket;
+
 
-	if (raparm_cache)
+	if (raparml)
 		return 0;
+	if (cache_size < 2*RAPARM_HASH_SIZE)
+		cache_size = 2*RAPARM_HASH_SIZE;
 	raparml = kmalloc(sizeof(struct raparms) * cache_size, GFP_KERNEL);
 
 	if (raparml != NULL) {
 		dprintk("nfsd: allocating %d readahead buffers.\n",
 			cache_size);
+		for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
+			raparm_hash[i].pb_head = NULL;
+			spin_lock_init(&raparm_hash[i].pb_lock);
+		}
+		nperbucket = cache_size >> RAPARM_HASH_BITS;
 		memset(raparml, 0, sizeof(struct raparms) * cache_size);
 		for (i = 0; i < cache_size - 1; i++) {
-			raparml[i].p_next = raparml + i + 1;
+			if (i % nperbucket == 0)
+				raparm_hash[j++].pb_head = raparml + i;
+			if (i % nperbucket < nperbucket-1)
+				raparml[i].p_next = raparml + i + 1;
 		}
-		raparm_cache = raparml;
 	} else {
 		printk(KERN_WARNING
 		       "nfsd: Could not allocate memory read-ahead cache.\n");
diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c
index b83381c07ad6..6993faea28ac 100644
--- a/fs/nls/nls_ascii.c
+++ b/fs/nls/nls_ascii.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_ascii.c
+ * linux/fs/nls/nls_ascii.c
  *
  * Charset ascii translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 9de6b495f112..7dfdab98729b 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_base.c
+ * linux/fs/nls/nls_base.c
  *
  * Native language support--charsets and unicode translations.
  * By Gordon Chaffee 1996, 1997
@@ -163,8 +163,6 @@ int register_nls(struct nls_table * nls)
 {
 	struct nls_table ** tmp = &tables;
 
-	if (!nls)
-		return -EINVAL;
 	if (nls->next)
 		return -EBUSY;
 
diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c
index 32e78cf95180..570aa69846a0 100644
--- a/fs/nls/nls_cp1250.c
+++ b/fs/nls/nls_cp1250.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp1250.c
+ * linux/fs/nls/nls_cp1250.c
  *
  * Charset cp1250 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c
index cb41c8ae4486..f114afa069db 100644
--- a/fs/nls/nls_cp1251.c
+++ b/fs/nls/nls_cp1251.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp1251.c
+ * linux/fs/nls/nls_cp1251.c
  *
  * Charset cp1251 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c
index efdeefee5346..e57f2cbf5bc0 100644
--- a/fs/nls/nls_cp1255.c
+++ b/fs/nls/nls_cp1255.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp1255.c
+ * linux/fs/nls/nls_cp1255.c
  *
  * Charset cp1255 translation tables.
  * The Unicode to charset table has only exact mappings.
diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c
index 5c4a1cd685dd..d41930ce4a44 100644
--- a/fs/nls/nls_cp437.c
+++ b/fs/nls/nls_cp437.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp437.c
+ * linux/fs/nls/nls_cp437.c
  *
  * Charset cp437 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c
index e8b3ca8462e7..d21f8790aa19 100644
--- a/fs/nls/nls_cp737.c
+++ b/fs/nls/nls_cp737.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp737.c
+ * linux/fs/nls/nls_cp737.c
  *
  * Charset cp737 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c
index bdb290ea523a..c97714c38a90 100644
--- a/fs/nls/nls_cp775.c
+++ b/fs/nls/nls_cp775.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp775.c
+ * linux/fs/nls/nls_cp775.c
  *
  * Charset cp775 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c
index 25deaa4c8648..843b7d975ba2 100644
--- a/fs/nls/nls_cp850.c
+++ b/fs/nls/nls_cp850.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp850.c
+ * linux/fs/nls/nls_cp850.c
  *
  * Charset cp850 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c
index b822a7b6b970..83cfd844d5ca 100644
--- a/fs/nls/nls_cp852.c
+++ b/fs/nls/nls_cp852.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp852.c
+ * linux/fs/nls/nls_cp852.c
  *
  * Charset cp852 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c
index e8641b7a8b27..9190b7b574ff 100644
--- a/fs/nls/nls_cp855.c
+++ b/fs/nls/nls_cp855.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp855.c
+ * linux/fs/nls/nls_cp855.c
  *
  * Charset cp855 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c
index 7ba589ef8cc0..ef3d36db8082 100644
--- a/fs/nls/nls_cp857.c
+++ b/fs/nls/nls_cp857.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp857.c
+ * linux/fs/nls/nls_cp857.c
  *
  * Charset cp857 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c
index 3b9e49ce8c80..7e2fb6645893 100644
--- a/fs/nls/nls_cp860.c
+++ b/fs/nls/nls_cp860.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp860.c
+ * linux/fs/nls/nls_cp860.c
  *
  * Charset cp860 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c
index 959ff64ee971..66d8d808ccf1 100644
--- a/fs/nls/nls_cp861.c
+++ b/fs/nls/nls_cp861.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp861.c
+ * linux/fs/nls/nls_cp861.c
  *
  * Charset cp861 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c
index b96928f5a023..360ba388485f 100644
--- a/fs/nls/nls_cp862.c
+++ b/fs/nls/nls_cp862.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp862.c
+ * linux/fs/nls/nls_cp862.c
  *
  * Charset cp862 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c
index baa6e0eab1d6..656a93113e37 100644
--- a/fs/nls/nls_cp863.c
+++ b/fs/nls/nls_cp863.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp863.c
+ * linux/fs/nls/nls_cp863.c
  *
  * Charset cp863 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c
index f4dabb037dfe..01ca7309753e 100644
--- a/fs/nls/nls_cp864.c
+++ b/fs/nls/nls_cp864.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp864.c
+ * linux/fs/nls/nls_cp864.c
  *
  * Charset cp864 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c
index 4caeafae32c2..5ba6ee13e109 100644
--- a/fs/nls/nls_cp865.c
+++ b/fs/nls/nls_cp865.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp865.c
+ * linux/fs/nls/nls_cp865.c
  *
  * Charset cp865 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c
index f2b4a9a293fb..c5f82221c9fe 100644
--- a/fs/nls/nls_cp866.c
+++ b/fs/nls/nls_cp866.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp866.c
+ * linux/fs/nls/nls_cp866.c
  *
  * Charset cp866 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c
index 12b436f4eca1..8d4015124d11 100644
--- a/fs/nls/nls_cp869.c
+++ b/fs/nls/nls_cp869.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp869.c
+ * linux/fs/nls/nls_cp869.c
  *
  * Charset cp869 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c
index b5766a01703a..df042052c2db 100644
--- a/fs/nls/nls_cp874.c
+++ b/fs/nls/nls_cp874.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp874.c
+ * linux/fs/nls/nls_cp874.c
  *
  * Charset cp874 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c
index 2c1a17cdcd24..2a9ccf3bc7ef 100644
--- a/fs/nls/nls_cp932.c
+++ b/fs/nls/nls_cp932.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp932.c
+ * linux/fs/nls/nls_cp932.c
  *
  * Charset cp932 translation tables.
  * This translation table was generated automatically, the
diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c
index ef4cef464aba..046fde8170ea 100644
--- a/fs/nls/nls_cp936.c
+++ b/fs/nls/nls_cp936.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp936.c
+ * linux/fs/nls/nls_cp936.c
  *
  * Charset cp936 translation tables.
  * This translation table was generated automatically, the
diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c
index 4351ae21d897..92ae19372f0f 100644
--- a/fs/nls/nls_cp949.c
+++ b/fs/nls/nls_cp949.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp949.c
+ * linux/fs/nls/nls_cp949.c
  *
  * Charset cp949 translation tables.
  * This translation table was generated automatically, the
diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c
index 8167a2858879..5665945fb88c 100644
--- a/fs/nls/nls_cp950.c
+++ b/fs/nls/nls_cp950.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_cp950.c
+ * linux/fs/nls/nls_cp950.c
  *
  * Charset cp950 translation tables.
  * This translation table was generated automatically, the
diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c
index 06640c3e4021..73293511578b 100644
--- a/fs/nls/nls_euc-jp.c
+++ b/fs/nls/nls_euc-jp.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_euc-jp.c
+ * linux/fs/nls/nls_euc-jp.c
  *
  * Added `OSF/JVC Recommended Code Set Conversion Specification
  * between Japanese EUC and Shift-JIS' support: <hirofumi@mail.parknet.co.jp>
diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c
index 70a2c1956723..2483c3c6c1c1 100644
--- a/fs/nls/nls_iso8859-1.c
+++ b/fs/nls/nls_iso8859-1.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-1.c
+ * linux/fs/nls/nls_iso8859-1.c
  *
  * Charset iso8859-1 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c
index 4547035f21a3..7b8721d74368 100644
--- a/fs/nls/nls_iso8859-13.c
+++ b/fs/nls/nls_iso8859-13.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-13.c
+ * linux/fs/nls/nls_iso8859-13.c
  *
  * Charset iso8859-13 translation tables.
  * The Unicode to charset table has only exact mappings.
diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c
index 13628d0dd3a9..2e895e638dba 100644
--- a/fs/nls/nls_iso8859-14.c
+++ b/fs/nls/nls_iso8859-14.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-14.c
+ * linux/fs/nls/nls_iso8859-14.c
  *
  * Charset iso8859-14 translation tables.
  *
diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c
index 88b924bf7e18..5c91592779fe 100644
--- a/fs/nls/nls_iso8859-15.c
+++ b/fs/nls/nls_iso8859-15.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-15.c
+ * linux/fs/nls/nls_iso8859-15.c
  *
  * Charset iso8859-15 translation tables.
  * The Unicode to charset table has only exact mappings.
diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c
index 372528a6c40c..892d38fe9530 100644
--- a/fs/nls/nls_iso8859-2.c
+++ b/fs/nls/nls_iso8859-2.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-2.c
+ * linux/fs/nls/nls_iso8859-2.c
  *
  * Charset iso8859-2 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c
index 81b45a234369..49317bcdb4be 100644
--- a/fs/nls/nls_iso8859-3.c
+++ b/fs/nls/nls_iso8859-3.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-3.c
+ * linux/fs/nls/nls_iso8859-3.c
  *
  * Charset iso8859-3 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c
index 101b87f5a49b..9f3b9368c2cf 100644
--- a/fs/nls/nls_iso8859-4.c
+++ b/fs/nls/nls_iso8859-4.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-4.c
+ * linux/fs/nls/nls_iso8859-4.c
  *
  * Charset iso8859-4 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c
index 83b0084de5eb..001a2bb132ce 100644
--- a/fs/nls/nls_iso8859-5.c
+++ b/fs/nls/nls_iso8859-5.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-5.c
+ * linux/fs/nls/nls_iso8859-5.c
  *
  * Charset iso8859-5 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c
index 0c519d65f55b..8cec03d66088 100644
--- a/fs/nls/nls_iso8859-6.c
+++ b/fs/nls/nls_iso8859-6.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-6.c
+ * linux/fs/nls/nls_iso8859-6.c
  *
  * Charset iso8859-6 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c
index bd0854625acf..1be707d5ac31 100644
--- a/fs/nls/nls_iso8859-7.c
+++ b/fs/nls/nls_iso8859-7.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-7.c
+ * linux/fs/nls/nls_iso8859-7.c
  *
  * Charset iso8859-7 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c
index 988eff791c06..8c0146f73834 100644
--- a/fs/nls/nls_iso8859-9.c
+++ b/fs/nls/nls_iso8859-9.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_iso8859-9.c
+ * linux/fs/nls/nls_iso8859-9.c
  *
  * Charset iso8859-9 translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c
index 0ad22c249796..fefbe0807265 100644
--- a/fs/nls/nls_koi8-r.c
+++ b/fs/nls/nls_koi8-r.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_koi8-r.c
+ * linux/fs/nls/nls_koi8-r.c
  *
  * Charset koi8-r translation tables.
  * Generated automatically from the Unicode and charset
diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c
index 5db83efe27c6..e7bc1d75c78c 100644
--- a/fs/nls/nls_koi8-ru.c
+++ b/fs/nls/nls_koi8-ru.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_koi8-ru.c
+ * linux/fs/nls/nls_koi8-ru.c
  *
  * Charset koi8-ru translation based on charset koi8-u.
  * The Unicode to charset table has only exact mappings.
diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c
index 9d30fd61cf46..015070211f22 100644
--- a/fs/nls/nls_koi8-u.c
+++ b/fs/nls/nls_koi8-u.c
@@ -1,5 +1,5 @@
 /*
- * linux/fs/nls_koi8-u.c
+ * linux/fs/nls/nls_koi8-u.c
  *
  * Charset koi8-u translation tables.
  * The Unicode to charset table has only exact mappings.
diff --git a/fs/no-block.c b/fs/no-block.c
new file mode 100644
index 000000000000..d269a93d3467
--- /dev/null
+++ b/fs/no-block.c
@@ -0,0 +1,22 @@
+/* no-block.c: implementation of routines required for non-BLOCK configuration
+ *
+ * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+
+static int no_blkdev_open(struct inode * inode, struct file * filp)
+{
+	return -ENODEV;
+}
+
+const struct file_operations def_blk_fops = {
+	.open		= no_blkdev_open,
+};
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index bc579bfdfbd8..7b2c8f4f6a6f 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -254,7 +254,7 @@ static int ntfs_read_block(struct page *page)
 		bh->b_bdev = vol->sb->s_bdev;
 		/* Is the block within the allowed limits? */
 		if (iblock < lblock) {
-			BOOL is_retry = FALSE;
+			bool is_retry = false;
 
 			/* Convert iblock into corresponding vcn and offset. */
 			vcn = (VCN)iblock << blocksize_bits >>
@@ -292,7 +292,7 @@ lock_retry_remap:
 				goto handle_hole;
 			/* If first try and runlist unmapped, map and retry. */
 			if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-				is_retry = TRUE;
+				is_retry = true;
 				/*
 				 * Attempt to map runlist, dropping lock for
 				 * the duration.
@@ -558,7 +558,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 	unsigned long flags;
 	unsigned int blocksize, vcn_ofs;
 	int err;
-	BOOL need_end_writeback;
+	bool need_end_writeback;
 	unsigned char blocksize_bits;
 
 	vi = page->mapping->host;
@@ -626,7 +626,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 	rl = NULL;
 	err = 0;
 	do {
-		BOOL is_retry = FALSE;
+		bool is_retry = false;
 
 		if (unlikely(block >= dblock)) {
 			/*
@@ -768,7 +768,7 @@ lock_retry_remap:
 		}
 		/* If first try and runlist unmapped, map and retry. */
 		if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-			is_retry = TRUE;
+			is_retry = true;
 			/*
 			 * Attempt to map runlist, dropping lock for
 			 * the duration.
@@ -874,12 +874,12 @@ lock_retry_remap:
 	set_page_writeback(page);	/* Keeps try_to_free_buffers() away. */
 
 	/* Submit the prepared buffers for i/o. */
-	need_end_writeback = TRUE;
+	need_end_writeback = true;
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			submit_bh(WRITE, bh);
-			need_end_writeback = FALSE;
+			need_end_writeback = false;
 		}
 		bh = next;
 	} while (bh != head);
@@ -932,7 +932,7 @@ static int ntfs_write_mst_block(struct page *page,
 	runlist_element *rl;
 	int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
 	unsigned bh_size, rec_size_bits;
-	BOOL sync, is_mft, page_is_dirty, rec_is_dirty;
+	bool sync, is_mft, page_is_dirty, rec_is_dirty;
 	unsigned char bh_size_bits;
 
 	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
@@ -975,10 +975,10 @@ static int ntfs_write_mst_block(struct page *page,
 
 	rl = NULL;
 	err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
-	page_is_dirty = rec_is_dirty = FALSE;
+	page_is_dirty = rec_is_dirty = false;
 	rec_start_bh = NULL;
 	do {
-		BOOL is_retry = FALSE;
+		bool is_retry = false;
 
 		if (likely(block < rec_block)) {
 			if (unlikely(block >= dblock)) {
@@ -1009,10 +1009,10 @@ static int ntfs_write_mst_block(struct page *page,
 			}
 			if (!buffer_dirty(bh)) {
 				/* Clean records are not written out. */
-				rec_is_dirty = FALSE;
+				rec_is_dirty = false;
 				continue;
 			}
-			rec_is_dirty = TRUE;
+			rec_is_dirty = true;
 			rec_start_bh = bh;
 		}
 		/* Need to map the buffer if it is not mapped already. */
@@ -1053,7 +1053,7 @@ lock_retry_remap:
 				 */
 				if (!is_mft && !is_retry &&
 						lcn == LCN_RL_NOT_MAPPED) {
-					is_retry = TRUE;
+					is_retry = true;
 					/*
 					 * Attempt to map runlist, dropping
 					 * lock for the duration.
@@ -1063,7 +1063,7 @@ lock_retry_remap:
 					if (likely(!err2))
 						goto lock_retry_remap;
 					if (err2 == -ENOMEM)
-						page_is_dirty = TRUE;
+						page_is_dirty = true;
 					lcn = err2;
 				} else {
 					err2 = -EIO;
@@ -1145,7 +1145,7 @@ lock_retry_remap:
 				 * means we need to redirty the page before
 				 * returning.
 				 */
-				page_is_dirty = TRUE;
+				page_is_dirty = true;
 				/*
 				 * Remove the buffers in this mft record from
 				 * the list of buffers to write.
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index 325ce261a107..9393f4b1e298 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -80,7 +80,7 @@ static inline void ntfs_unmap_page(struct page *page)
  *
  * The unlocked and uptodate page is returned on success or an encoded error
  * on failure. Caller has to test for error using the IS_ERR() macro on the
- * return value. If that evaluates to TRUE, the negative error code can be
+ * return value. If that evaluates to 'true', the negative error code can be
  * obtained using PTR_ERR() on the return value of ntfs_map_page().
  */
 static inline struct page *ntfs_map_page(struct address_space *mapping,
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 6708e1d68a9e..9f08e851cfb6 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -67,7 +67,7 @@
  * the attribute has zero allocated size, i.e. there simply is no runlist.
  *
  * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
  *	    is no longer valid, i.e. you need to either call
  *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
  *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
@@ -90,7 +90,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
 	runlist_element *rl;
 	struct page *put_this_page = NULL;
 	int err = 0;
-	BOOL ctx_is_temporary, ctx_needs_reset;
+	bool ctx_is_temporary, ctx_needs_reset;
 	ntfs_attr_search_ctx old_ctx = { NULL, };
 
 	ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
@@ -100,7 +100,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
 	else
 		base_ni = ni->ext.base_ntfs_ino;
 	if (!ctx) {
-		ctx_is_temporary = ctx_needs_reset = TRUE;
+		ctx_is_temporary = ctx_needs_reset = true;
 		m = map_mft_record(base_ni);
 		if (IS_ERR(m))
 			return PTR_ERR(m);
@@ -115,7 +115,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
 		BUG_ON(IS_ERR(ctx->mrec));
 		a = ctx->attr;
 		BUG_ON(!a->non_resident);
-		ctx_is_temporary = FALSE;
+		ctx_is_temporary = false;
 		end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
 		read_lock_irqsave(&ni->size_lock, flags);
 		allocated_size_vcn = ni->allocated_size >>
@@ -136,7 +136,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
 				ni->name, ni->name_len) &&
 				sle64_to_cpu(a->data.non_resident.lowest_vcn)
 				<= vcn && end_vcn >= vcn))
-			ctx_needs_reset = FALSE;
+			ctx_needs_reset = false;
 		else {
 			/* Save the old search context. */
 			old_ctx = *ctx;
@@ -158,7 +158,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
 			 * needed attribute extent.
 			 */
 			ntfs_attr_reinit_search_ctx(ctx);
-			ctx_needs_reset = TRUE;
+			ctx_needs_reset = true;
 		}
 	}
 	if (ctx_needs_reset) {
@@ -336,16 +336,16 @@ int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
  *  LCN_EIO	Critical error (runlist/file is corrupt, i/o error, etc).
  *
  * Locking: - The runlist must be locked on entry and is left locked on return.
- *	    - If @write_locked is FALSE, i.e. the runlist is locked for reading,
+ *	    - If @write_locked is 'false', i.e. the runlist is locked for reading,
  *	      the lock may be dropped inside the function so you cannot rely on
  *	      the runlist still being the same when this function returns.
  */
 LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
-		const BOOL write_locked)
+		const bool write_locked)
 {
 	LCN lcn;
 	unsigned long flags;
-	BOOL is_retry = FALSE;
+	bool is_retry = false;
 
 	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
 			ni->mft_no, (unsigned long long)vcn,
@@ -390,7 +390,7 @@ retry_remap:
 			down_read(&ni->runlist.lock);
 		}
 		if (likely(!err)) {
-			is_retry = TRUE;
+			is_retry = true;
 			goto retry_remap;
 		}
 		if (err == -ENOENT)
@@ -449,7 +449,7 @@ retry_remap:
  *	-EIO	- Critical error (runlist/file is corrupt, i/o error, etc).
  *
  * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
  *	    is no longer valid, i.e. you need to either call
  *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
  *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
@@ -469,7 +469,7 @@ runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
 	unsigned long flags;
 	runlist_element *rl;
 	int err = 0;
-	BOOL is_retry = FALSE;
+	bool is_retry = false;
 
 	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
 			ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
@@ -518,7 +518,7 @@ retry_remap:
 			 */
 			err = ntfs_map_runlist_nolock(ni, vcn, ctx);
 			if (likely(!err)) {
-				is_retry = TRUE;
+				is_retry = true;
 				goto retry_remap;
 			}
 		}
@@ -558,8 +558,8 @@ retry_remap:
  * On actual error, ntfs_attr_find() returns -EIO.  In this case @ctx->attr is
  * undefined and in particular do not rely on it not changing.
  *
- * If @ctx->is_first is TRUE, the search begins with @ctx->attr itself.  If it
- * is FALSE, the search begins after @ctx->attr.
+ * If @ctx->is_first is 'true', the search begins with @ctx->attr itself.  If it
+ * is 'false', the search begins after @ctx->attr.
  *
  * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and
  * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
@@ -599,11 +599,11 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
 
 	/*
 	 * Iterate over attributes in mft record starting at @ctx->attr, or the
-	 * attribute following that, if @ctx->is_first is TRUE.
+	 * attribute following that, if @ctx->is_first is 'true'.
 	 */
 	if (ctx->is_first) {
 		a = ctx->attr;
-		ctx->is_first = FALSE;
+		ctx->is_first = false;
 	} else
 		a = (ATTR_RECORD*)((u8*)ctx->attr +
 				le32_to_cpu(ctx->attr->length));
@@ -890,11 +890,11 @@ static int ntfs_external_attr_find(const ATTR_TYPE type,
 		ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
 	/*
 	 * Iterate over entries in attribute list starting at @ctx->al_entry,
-	 * or the entry following that, if @ctx->is_first is TRUE.
+	 * or the entry following that, if @ctx->is_first is 'true'.
 	 */
 	if (ctx->is_first) {
 		al_entry = ctx->al_entry;
-		ctx->is_first = FALSE;
+		ctx->is_first = false;
 	} else
 		al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
 				le16_to_cpu(ctx->al_entry->length));
@@ -1127,7 +1127,7 @@ not_found:
 	ctx->mrec = ctx->base_mrec;
 	ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
 			le16_to_cpu(ctx->mrec->attrs_offset));
-	ctx->is_first = TRUE;
+	ctx->is_first = true;
 	ctx->ntfs_ino = base_ni;
 	ctx->base_ntfs_ino = NULL;
 	ctx->base_mrec = NULL;
@@ -1224,7 +1224,7 @@ static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
 		/* Sanity checks are performed elsewhere. */
 		.attr = (ATTR_RECORD*)((u8*)mrec +
 				le16_to_cpu(mrec->attrs_offset)),
-		.is_first = TRUE,
+		.is_first = true,
 		.ntfs_ino = ni,
 	};
 }
@@ -1243,7 +1243,7 @@ void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
 {
 	if (likely(!ctx->base_ntfs_ino)) {
 		/* No attribute list. */
-		ctx->is_first = TRUE;
+		ctx->is_first = true;
 		/* Sanity checks are performed elsewhere. */
 		ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
 				le16_to_cpu(ctx->mrec->attrs_offset));
@@ -1585,7 +1585,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
 			return -ENOMEM;
 		/* Start by allocating clusters to hold the attribute value. */
 		rl = ntfs_cluster_alloc(vol, 0, new_size >>
-				vol->cluster_size_bits, -1, DATA_ZONE, TRUE);
+				vol->cluster_size_bits, -1, DATA_ZONE, true);
 		if (IS_ERR(rl)) {
 			err = PTR_ERR(rl);
 			ntfs_debug("Failed to allocate cluster%s, error code "
@@ -1919,7 +1919,7 @@ s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
 	unsigned long flags;
 	int err, mp_size;
 	u32 attr_len = 0; /* Silence stupid gcc warning. */
-	BOOL mp_rebuilt;
+	bool mp_rebuilt;
 
 #ifdef NTFS_DEBUG
 	read_lock_irqsave(&ni->size_lock, flags);
@@ -2222,7 +2222,7 @@ first_alloc:
 	rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
 			(new_alloc_size - allocated_size) >>
 			vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
-			rl->lcn + rl->length : -1, DATA_ZONE, TRUE);
+			rl->lcn + rl->length : -1, DATA_ZONE, true);
 	if (IS_ERR(rl2)) {
 		err = PTR_ERR(rl2);
 		if (start < 0 || start >= allocated_size)
@@ -2265,7 +2265,7 @@ first_alloc:
 	BUG_ON(!rl2);
 	BUG_ON(!rl2->length);
 	BUG_ON(rl2->lcn < LCN_HOLE);
-	mp_rebuilt = FALSE;
+	mp_rebuilt = false;
 	/* Get the size for the new mapping pairs array for this extent. */
 	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
 	if (unlikely(mp_size <= 0)) {
@@ -2300,7 +2300,7 @@ first_alloc:
 		err = -EOPNOTSUPP;
 		goto undo_alloc;
 	}
-	mp_rebuilt = TRUE;
+	mp_rebuilt = true;
 	/* Generate the mapping pairs array directly into the attr record. */
 	err = ntfs_mapping_pairs_build(vol, (u8*)a +
 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 9074886b44ba..3c8b74c99b80 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -40,10 +40,10 @@
  * Structure must be initialized to zero before the first call to one of the
  * attribute search functions. Initialize @mrec to point to the mft record to
  * search, and @attr to point to the first attribute within @mrec (not necessary
- * if calling the _first() functions), and set @is_first to TRUE (not necessary
+ * if calling the _first() functions), and set @is_first to 'true' (not necessary
  * if calling the _first() functions).
  *
- * If @is_first is TRUE, the search begins with @attr. If @is_first is FALSE,
+ * If @is_first is 'true', the search begins with @attr. If @is_first is 'false',
  * the search begins after @attr. This is so that, after the first call to one
  * of the search attribute functions, we can call the function again, without
  * any modification of the search context, to automagically get the next
@@ -52,7 +52,7 @@
 typedef struct {
 	MFT_RECORD *mrec;
 	ATTR_RECORD *attr;
-	BOOL is_first;
+	bool is_first;
 	ntfs_inode *ntfs_ino;
 	ATTR_LIST_ENTRY *al_entry;
 	ntfs_inode *base_ntfs_ino;
@@ -65,7 +65,7 @@ extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
 extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
 
 extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
-		const BOOL write_locked);
+		const bool write_locked);
 
 extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
 		const VCN vcn, ntfs_attr_search_ctx *ctx);
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
index 7a190cdc60e2..0809cf876098 100644
--- a/fs/ntfs/bitmap.c
+++ b/fs/ntfs/bitmap.c
@@ -34,18 +34,18 @@
  * @start_bit:		first bit to set
  * @count:		number of bits to set
  * @value:		value to set the bits to (i.e. 0 or 1)
- * @is_rollback:	if TRUE this is a rollback operation
+ * @is_rollback:	if 'true' this is a rollback operation
  *
  * Set @count bits starting at bit @start_bit in the bitmap described by the
  * vfs inode @vi to @value, where @value is either 0 or 1.
  *
- * @is_rollback should always be FALSE, it is for internal use to rollback
+ * @is_rollback should always be 'false', it is for internal use to rollback
  * errors.  You probably want to use ntfs_bitmap_set_bits_in_run() instead.
  *
  * Return 0 on success and -errno on error.
  */
 int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
-		const s64 count, const u8 value, const BOOL is_rollback)
+		const s64 count, const u8 value, const bool is_rollback)
 {
 	s64 cnt = count;
 	pgoff_t index, end_index;
@@ -172,7 +172,7 @@ rollback:
 		return PTR_ERR(page);
 	if (count != cnt)
 		pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
-				value ? 0 : 1, TRUE);
+				value ? 0 : 1, true);
 	else
 		pos = 0;
 	if (!pos) {
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
index bb50d6bc9212..72c9ad8be70d 100644
--- a/fs/ntfs/bitmap.h
+++ b/fs/ntfs/bitmap.h
@@ -30,7 +30,7 @@
 #include "types.h"
 
 extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
-		const s64 count, const u8 value, const BOOL is_rollback);
+		const s64 count, const u8 value, const bool is_rollback);
 
 /**
  * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
@@ -48,7 +48,7 @@ static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
 		const s64 start_bit, const s64 count, const u8 value)
 {
 	return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
-			FALSE);
+			false);
 }
 
 /**
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
index e027f36fcc2f..aba83347e5fe 100644
--- a/fs/ntfs/collate.h
+++ b/fs/ntfs/collate.h
@@ -26,7 +26,7 @@
 #include "types.h"
 #include "volume.h"
 
-static inline BOOL ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
+static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
 	int i;
 
 	/*
@@ -35,12 +35,12 @@ static inline BOOL ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
 	 * now.
 	 */
 	if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
-		return FALSE;
+		return false;
 	i = le32_to_cpu(cr);
 	if (likely(((i >= 0) && (i <= 0x02)) ||
 			((i >= 0x10) && (i <= 0x13))))
-		return TRUE;
-	return FALSE;
+		return true;
+	return false;
 }
 
 extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 68a607ff9fd3..d98daf59e0b6 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -600,7 +600,7 @@ do_next_cb:
 	rl = NULL;
 	for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
 			vcn++) {
-		BOOL is_retry = FALSE;
+		bool is_retry = false;
 
 		if (!rl) {
 lock_retry_remap:
@@ -626,7 +626,7 @@ lock_retry_remap:
 				break;
 			if (is_retry || lcn != LCN_RL_NOT_MAPPED)
 				goto rl_err;
-			is_retry = TRUE;
+			is_retry = true;
 			/*
 			 * Attempt to map runlist, dropping lock for the
 			 * duration.
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index d1e2c6f9f05e..85c36b8ca452 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1149,8 +1149,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	 * Allocate a buffer to store the current name being processed
 	 * converted to format determined by current NLS.
 	 */
-	name = (u8*)kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1,
-			GFP_NOFS);
+	name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
 	if (unlikely(!name)) {
 		err = -ENOMEM;
 		goto err_out;
@@ -1191,7 +1190,7 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	 * map the mft record without deadlocking.
 	 */
 	rc = le32_to_cpu(ctx->attr->data.resident.value_length);
-	ir = (INDEX_ROOT*)kmalloc(rc, GFP_NOFS);
+	ir = kmalloc(rc, GFP_NOFS);
 	if (unlikely(!ir)) {
 		err = -ENOMEM;
 		goto err_out;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 2e42c2dcae12..ae2fe0016d2c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -509,7 +509,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
 	u32 attr_rec_len = 0;
 	unsigned blocksize, u;
 	int err, mp_size;
-	BOOL rl_write_locked, was_hole, is_retry;
+	bool rl_write_locked, was_hole, is_retry;
 	unsigned char blocksize_bits;
 	struct {
 		u8 runlist_merged:1;
@@ -543,13 +543,13 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
 				return -ENOMEM;
 		}
 	} while (++u < nr_pages);
-	rl_write_locked = FALSE;
+	rl_write_locked = false;
 	rl = NULL;
 	err = 0;
 	vcn = lcn = -1;
 	vcn_len = 0;
 	lcn_block = -1;
-	was_hole = FALSE;
+	was_hole = false;
 	cpos = pos >> vol->cluster_size_bits;
 	end = pos + bytes;
 	cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
@@ -760,7 +760,7 @@ map_buffer_cached:
 			}
 			continue;
 		}
-		is_retry = FALSE;
+		is_retry = false;
 		if (!rl) {
 			down_read(&ni->runlist.lock);
 retry_remap:
@@ -776,7 +776,7 @@ retry_remap:
 				 * Successful remap, setup the map cache and
 				 * use that to deal with the buffer.
 				 */
-				was_hole = FALSE;
+				was_hole = false;
 				vcn = bh_cpos;
 				vcn_len = rl[1].vcn - vcn;
 				lcn_block = lcn << (vol->cluster_size_bits -
@@ -792,7 +792,7 @@ retry_remap:
 				if (likely(vcn + vcn_len >= cend)) {
 					if (rl_write_locked) {
 						up_write(&ni->runlist.lock);
-						rl_write_locked = FALSE;
+						rl_write_locked = false;
 					} else
 						up_read(&ni->runlist.lock);
 					rl = NULL;
@@ -818,13 +818,13 @@ retry_remap:
 					 */
 					up_read(&ni->runlist.lock);
 					down_write(&ni->runlist.lock);
-					rl_write_locked = TRUE;
+					rl_write_locked = true;
 					goto retry_remap;
 				}
 				err = ntfs_map_runlist_nolock(ni, bh_cpos,
 						NULL);
 				if (likely(!err)) {
-					is_retry = TRUE;
+					is_retry = true;
 					goto retry_remap;
 				}
 				/*
@@ -903,7 +903,7 @@ rl_not_mapped_enoent:
 		if (!rl_write_locked) {
 			up_read(&ni->runlist.lock);
 			down_write(&ni->runlist.lock);
-			rl_write_locked = TRUE;
+			rl_write_locked = true;
 			goto retry_remap;
 		}
 		/* Find the previous last allocated cluster. */
@@ -917,7 +917,7 @@ rl_not_mapped_enoent:
 			}
 		}
 		rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
-				FALSE);
+				false);
 		if (IS_ERR(rl2)) {
 			err = PTR_ERR(rl2);
 			ntfs_debug("Failed to allocate cluster, error code %i.",
@@ -1093,7 +1093,7 @@ rl_not_mapped_enoent:
 		status.mft_attr_mapped = 0;
 		status.mp_rebuilt = 0;
 		/* Setup the map cache and use that to deal with the buffer. */
-		was_hole = TRUE;
+		was_hole = true;
 		vcn = bh_cpos;
 		vcn_len = 1;
 		lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
@@ -1105,7 +1105,7 @@ rl_not_mapped_enoent:
 		 */
 		if (likely(vcn + vcn_len >= cend)) {
 			up_write(&ni->runlist.lock);
-			rl_write_locked = FALSE;
+			rl_write_locked = false;
 			rl = NULL;
 		}
 		goto map_buffer_cached;
@@ -1117,7 +1117,7 @@ rl_not_mapped_enoent:
 	if (likely(!err)) {
 		if (unlikely(rl_write_locked)) {
 			up_write(&ni->runlist.lock);
-			rl_write_locked = FALSE;
+			rl_write_locked = false;
 		} else if (unlikely(rl))
 			up_read(&ni->runlist.lock);
 		rl = NULL;
@@ -1528,19 +1528,19 @@ static inline int ntfs_commit_pages_after_non_resident_write(
 	do {
 		s64 bh_pos;
 		struct page *page;
-		BOOL partial;
+		bool partial;
 
 		page = pages[u];
 		bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
 		bh = head = page_buffers(page);
-		partial = FALSE;
+		partial = false;
 		do {
 			s64 bh_end;
 
 			bh_end = bh_pos + blocksize;
 			if (bh_end <= pos || bh_pos >= end) {
 				if (!buffer_uptodate(bh))
-					partial = TRUE;
+					partial = true;
 			} else {
 				set_buffer_uptodate(bh);
 				mark_buffer_dirty(bh);
@@ -1997,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 				 */
 				down_read(&ni->runlist.lock);
 				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
-						vol->cluster_size_bits, FALSE);
+						vol->cluster_size_bits, false);
 				up_read(&ni->runlist.lock);
 				if (unlikely(lcn < LCN_HOLE)) {
 					status = -EIO;
@@ -2176,20 +2176,18 @@ out:
 /**
  * ntfs_file_aio_write -
  */
-static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf,
-		size_t count, loff_t pos)
+static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	ssize_t ret;
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-				   .iov_len = count };
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
-	ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 	if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = sync_page_range(inode, mapping, pos, ret);
@@ -2298,13 +2296,11 @@ static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
 
 const struct file_operations ntfs_file_ops = {
 	.llseek		= generic_file_llseek,	 /* Seek inside file. */
-	.read		= generic_file_read,	 /* Read from file. */
+	.read		= do_sync_read,		 /* Read from file. */
 	.aio_read	= generic_file_aio_read, /* Async read from file. */
-	.readv		= generic_file_readv,	 /* Read from file. */
 #ifdef NTFS_RW
 	.write		= ntfs_file_write,	 /* Write to file. */
 	.aio_write	= ntfs_file_aio_write,	 /* Async write to file. */
-	.writev		= ntfs_file_writev,	 /* Write to file. */
 	/*.release	= ,*/			 /* Last file is closed.  See
 						    fs/ext2/file.c::
 						    ext2_release_file() for
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 9f5427c2d105..e32cde486362 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -204,7 +204,7 @@ int ntfs_index_lookup(const void *key, const int key_len,
 		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
 				&ie->key, key_len)) {
 ir_done:
-			ictx->is_in_root = TRUE;
+			ictx->is_in_root = true;
 			ictx->ir = ir;
 			ictx->actx = actx;
 			ictx->base_ni = base_ni;
@@ -374,7 +374,7 @@ fast_descend_into_child_node:
 		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
 				&ie->key, key_len)) {
 ia_done:
-			ictx->is_in_root = FALSE;
+			ictx->is_in_root = false;
 			ictx->actx = NULL;
 			ictx->base_ni = NULL;
 			ictx->ia = ia;
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
index 846a489e8692..8745469c3989 100644
--- a/fs/ntfs/index.h
+++ b/fs/ntfs/index.h
@@ -37,12 +37,12 @@
  * @entry:	index entry (points into @ir or @ia)
  * @data:	index entry data (points into @entry)
  * @data_len:	length in bytes of @data
- * @is_in_root:	TRUE if @entry is in @ir and FALSE if it is in @ia
+ * @is_in_root:	'true' if @entry is in @ir and 'false' if it is in @ia
  * @ir:		index root if @is_in_root and NULL otherwise
  * @actx:	attribute search context if @is_in_root and NULL otherwise
  * @base_ni:	base inode if @is_in_root and NULL otherwise
- * @ia:		index block if @is_in_root is FALSE and NULL otherwise
- * @page:	page if @is_in_root is FALSE and NULL otherwise
+ * @ia:		index block if @is_in_root is 'false' and NULL otherwise
+ * @page:	page if @is_in_root is 'false' and NULL otherwise
  *
  * @idx_ni is the index inode this context belongs to.
  *
@@ -50,11 +50,11 @@
  * are the index entry data and its length in bytes, respectively.  @data
  * simply points into @entry.  This is probably what the user is interested in.
  *
- * If @is_in_root is TRUE, @entry is in the index root attribute @ir described
+ * If @is_in_root is 'true', @entry is in the index root attribute @ir described
  * by the attribute search context @actx and the base inode @base_ni.  @ia and
  * @page are NULL in this case.
  *
- * If @is_in_root is FALSE, @entry is in the index allocation attribute and @ia
+ * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia
  * and @page point to the index allocation block and the mapped, locked page it
  * is in, respectively.  @ir, @actx and @base_ni are NULL in this case.
  *
@@ -77,7 +77,7 @@ typedef struct {
 	INDEX_ENTRY *entry;
 	void *data;
 	u16 data_len;
-	BOOL is_in_root;
+	bool is_in_root;
 	INDEX_ROOT *ir;
 	ntfs_attr_search_ctx *actx;
 	ntfs_inode *base_ni;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d313f356e66a..2d3de9c89818 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -137,7 +137,7 @@ static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
 
 		BUG_ON(!na->name);
 		i = na->name_len * sizeof(ntfschar);
-		ni->name = (ntfschar*)kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
+		ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
 		if (!ni->name)
 			return -ENOMEM;
 		memcpy(ni->name, na->name, i);
@@ -556,8 +556,6 @@ static int ntfs_read_locked_inode(struct inode *vi)
 
 	/* Setup the generic vfs inode parts now. */
 
-	/* This is the optimal IO size (for stat), not the fs block size. */
-	vi->i_blksize = PAGE_CACHE_SIZE;
 	/*
 	 * This is for checking whether an inode has changed w.r.t. a file so
 	 * that the file can be updated if necessary (compare with f_version).
@@ -1234,7 +1232,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 	base_ni = NTFS_I(base_vi);
 
 	/* Just mirror the values from the base inode. */
-	vi->i_blksize	= base_vi->i_blksize;
 	vi->i_version	= base_vi->i_version;
 	vi->i_uid	= base_vi->i_uid;
 	vi->i_gid	= base_vi->i_gid;
@@ -1504,7 +1501,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 	ni	= NTFS_I(vi);
 	base_ni = NTFS_I(base_vi);
 	/* Just mirror the values from the base inode. */
-	vi->i_blksize	= base_vi->i_blksize;
 	vi->i_version	= base_vi->i_version;
 	vi->i_uid	= base_vi->i_uid;
 	vi->i_gid	= base_vi->i_gid;
@@ -2305,7 +2301,7 @@ void ntfs_clear_big_inode(struct inode *vi)
 	}
 #ifdef NTFS_RW
 	if (NInoDirty(ni)) {
-		BOOL was_bad = (is_bad_inode(vi));
+		bool was_bad = (is_bad_inode(vi));
 
 		/* Committing the inode also commits all extent inodes. */
 		ntfs_commit_inode(vi);
@@ -3019,7 +3015,7 @@ int ntfs_write_inode(struct inode *vi, int sync)
 	MFT_RECORD *m;
 	STANDARD_INFORMATION *si;
 	int err = 0;
-	BOOL modified = FALSE;
+	bool modified = false;
 
 	ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
 			vi->i_ino);
@@ -3061,7 +3057,7 @@ int ntfs_write_inode(struct inode *vi, int sync)
 				sle64_to_cpu(si->last_data_change_time),
 				(long long)sle64_to_cpu(nt));
 		si->last_data_change_time = nt;
-		modified = TRUE;
+		modified = true;
 	}
 	nt = utc2ntfs(vi->i_ctime);
 	if (si->last_mft_change_time != nt) {
@@ -3070,7 +3066,7 @@ int ntfs_write_inode(struct inode *vi, int sync)
 				sle64_to_cpu(si->last_mft_change_time),
 				(long long)sle64_to_cpu(nt));
 		si->last_mft_change_time = nt;
-		modified = TRUE;
+		modified = true;
 	}
 	nt = utc2ntfs(vi->i_atime);
 	if (si->last_access_time != nt) {
@@ -3079,7 +3075,7 @@ int ntfs_write_inode(struct inode *vi, int sync)
 				(long long)sle64_to_cpu(si->last_access_time),
 				(long long)sle64_to_cpu(nt));
 		si->last_access_time = nt;
-		modified = TRUE;
+		modified = true;
 	}
 	/*
 	 * If we just modified the standard information attribute we need to
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index d34b93cb8b48..1e383328eceb 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -142,13 +142,13 @@ typedef le32 NTFS_RECORD_TYPE;
  * operator! (-8
  */
 
-static inline BOOL __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
+static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
 {
 	return (x == r);
 }
 #define ntfs_is_magic(x, m)	__ntfs_is_magic(x, magic_##m)
 
-static inline BOOL __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
+static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
 {
 	return (*p == r);
 }
@@ -323,7 +323,7 @@ typedef le64 leMFT_REF;
 #define MREF_LE(x)	((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
 #define MSEQNO_LE(x)	((u16)((le64_to_cpu(x) >> 48) & 0xffff))
 
-#define IS_ERR_MREF(x)	(((x) & 0x0000800000000000ULL) ? 1 : 0)
+#define IS_ERR_MREF(x)	(((x) & 0x0000800000000000ULL) ? true : false)
 #define ERR_MREF(x)	((u64)((s64)(x)))
 #define MREF_ERR(x)	((int)((s64)(x)))
 
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 29cabf93d2d2..1711b710b641 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -76,7 +76,7 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
  * @count:	number of clusters to allocate
  * @start_lcn:	starting lcn at which to allocate the clusters (or -1 if none)
  * @zone:	zone from which to allocate the clusters
- * @is_extension:	if TRUE, this is an attribute extension
+ * @is_extension:	if 'true', this is an attribute extension
  *
  * Allocate @count clusters preferably starting at cluster @start_lcn or at the
  * current allocator position if @start_lcn is -1, on the mounted ntfs volume
@@ -87,11 +87,11 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
  * @start_vcn specifies the vcn of the first allocated cluster.  This makes
  * merging the resulting runlist with the old runlist easier.
  *
- * If @is_extension is TRUE, the caller is allocating clusters to extend an
- * attribute and if it is FALSE, the caller is allocating clusters to fill a
+ * If @is_extension is 'true', the caller is allocating clusters to extend an
+ * attribute and if it is 'false', the caller is allocating clusters to fill a
  * hole in an attribute.  Practically the difference is that if @is_extension
- * is TRUE the returned runlist will be terminated with LCN_ENOENT and if
- * @is_extension is FALSE the runlist will be terminated with
+ * is 'true' the returned runlist will be terminated with LCN_ENOENT and if
+ * @is_extension is 'false' the runlist will be terminated with
  * LCN_RL_NOT_MAPPED.
  *
  * You need to check the return value with IS_ERR().  If this is false, the
@@ -146,7 +146,7 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
 runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 		const s64 count, const LCN start_lcn,
 		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
-		const BOOL is_extension)
+		const bool is_extension)
 {
 	LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
 	LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
@@ -818,7 +818,7 @@ out:
  * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
  * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
  *
- * @is_rollback should always be FALSE, it is for internal use to rollback
+ * @is_rollback should always be 'false', it is for internal use to rollback
  * errors.  You probably want to use ntfs_cluster_free() instead.
  *
  * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
@@ -828,7 +828,7 @@ out:
  * success and -errno on error.
  *
  * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
  *	    is no longer valid, i.e. you need to either call
  *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
  *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
@@ -847,7 +847,7 @@ out:
  *	      and it will be left mapped on return.
  */
 s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
-		ntfs_attr_search_ctx *ctx, const BOOL is_rollback)
+		ntfs_attr_search_ctx *ctx, const bool is_rollback)
 {
 	s64 delta, to_free, total_freed, real_freed;
 	ntfs_volume *vol;
@@ -999,7 +999,7 @@ err_out:
 	 * If rollback fails, set the volume errors flag, emit an error
 	 * message, and return the error code.
 	 */
-	delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, TRUE);
+	delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true);
 	if (delta < 0) {
 		ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
 				"inconsistent metadata!  Unmount and run "
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
index 72cbca7003b2..2adb04316941 100644
--- a/fs/ntfs/lcnalloc.h
+++ b/fs/ntfs/lcnalloc.h
@@ -43,10 +43,10 @@ typedef enum {
 extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
 		const VCN start_vcn, const s64 count, const LCN start_lcn,
 		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
-		const BOOL is_extension);
+		const bool is_extension);
 
 extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
-		s64 count, ntfs_attr_search_ctx *ctx, const BOOL is_rollback);
+		s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback);
 
 /**
  * ntfs_cluster_free - free clusters on an ntfs volume
@@ -86,7 +86,7 @@ extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
  * success and -errno on error.
  *
  * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if TRUE the @ctx
+ *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
  *	    is no longer valid, i.e. you need to either call
  *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
  *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
@@ -107,7 +107,7 @@ extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
 static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
 		s64 count, ntfs_attr_search_ctx *ctx)
 {
-	return __ntfs_cluster_free(ni, start_vcn, count, ctx, FALSE);
+	return __ntfs_cluster_free(ni, start_vcn, count, ctx, false);
 }
 
 extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 4af2ad1193ec..acfed325f4ec 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -41,18 +41,18 @@
  * @rp:		restart page header to check
  * @pos:	position in @vi at which the restart page header resides
  *
- * Check the restart page header @rp for consistency and return TRUE if it is
- * consistent and FALSE otherwise.
+ * Check the restart page header @rp for consistency and return 'true' if it is
+ * consistent and 'false' otherwise.
  *
  * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
  * require the full restart page.
  */
-static BOOL ntfs_check_restart_page_header(struct inode *vi,
+static bool ntfs_check_restart_page_header(struct inode *vi,
 		RESTART_PAGE_HEADER *rp, s64 pos)
 {
 	u32 logfile_system_page_size, logfile_log_page_size;
 	u16 ra_ofs, usa_count, usa_ofs, usa_end = 0;
-	BOOL have_usa = TRUE;
+	bool have_usa = true;
 
 	ntfs_debug("Entering.");
 	/*
@@ -67,7 +67,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 			(logfile_system_page_size - 1) ||
 			logfile_log_page_size & (logfile_log_page_size - 1)) {
 		ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * We must be either at !pos (1st restart page) or at pos = system page
@@ -76,7 +76,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 	if (pos && pos != logfile_system_page_size) {
 		ntfs_error(vi->i_sb, "Found restart area in incorrect "
 				"position in $LogFile.");
-		return FALSE;
+		return false;
 	}
 	/* We only know how to handle version 1.1. */
 	if (sle16_to_cpu(rp->major_ver) != 1 ||
@@ -85,14 +85,14 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 				"supported.  (This driver supports version "
 				"1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
 				(int)sle16_to_cpu(rp->minor_ver));
-		return FALSE;
+		return false;
 	}
 	/*
 	 * If chkdsk has been run the restart page may not be protected by an
 	 * update sequence array.
 	 */
 	if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) {
-		have_usa = FALSE;
+		have_usa = false;
 		goto skip_usa_checks;
 	}
 	/* Verify the size of the update sequence array. */
@@ -100,7 +100,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 	if (usa_count != le16_to_cpu(rp->usa_count)) {
 		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
 				"inconsistent update sequence array count.");
-		return FALSE;
+		return false;
 	}
 	/* Verify the position of the update sequence array. */
 	usa_ofs = le16_to_cpu(rp->usa_ofs);
@@ -109,7 +109,7 @@ static BOOL ntfs_check_restart_page_header(struct inode *vi,
 			usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
 		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
 				"inconsistent update sequence array offset.");
-		return FALSE;
+		return false;
 	}
 skip_usa_checks:
 	/*
@@ -124,7 +124,7 @@ skip_usa_checks:
 			ra_ofs > logfile_system_page_size) {
 		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
 				"inconsistent restart area offset.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
@@ -133,10 +133,10 @@ skip_usa_checks:
 	if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
 		ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
 				"by chkdsk but a chkdsk LSN is specified.");
-		return FALSE;
+		return false;
 	}
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
@@ -145,7 +145,7 @@ skip_usa_checks:
  * @rp:		restart page whose restart area to check
  *
  * Check the restart area of the restart page @rp for consistency and return
- * TRUE if it is consistent and FALSE otherwise.
+ * 'true' if it is consistent and 'false' otherwise.
  *
  * This function assumes that the restart page header has already been
  * consistency checked.
@@ -153,7 +153,7 @@ skip_usa_checks:
  * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
  * require the full restart page.
  */
-static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
+static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 {
 	u64 file_size;
 	RESTART_AREA *ra;
@@ -172,7 +172,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 			NTFS_BLOCK_SIZE - sizeof(u16)) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent file offset.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Now that we can access ra->client_array_offset, make sure everything
@@ -186,7 +186,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 			ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent client array offset.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * The restart area must end within the system page size both when
@@ -203,7 +203,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 				"of the system page size specified by the "
 				"restart page header and/or the specified "
 				"restart area length is inconsistent.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * The ra->client_free_list and ra->client_in_use_list must be either
@@ -218,7 +218,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 			le16_to_cpu(ra->log_clients))) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"overflowing client free and/or in use lists.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Check ra->seq_number_bits against ra->file_size for consistency.
@@ -233,24 +233,24 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
 	if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent sequence number bits.");
-		return FALSE;
+		return false;
 	}
 	/* The log record header length must be a multiple of 8. */
 	if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
 			le16_to_cpu(ra->log_record_header_length)) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent log record header length.");
-		return FALSE;
+		return false;
 	}
 	/* Dito for the log page data offset. */
 	if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
 			le16_to_cpu(ra->log_page_data_offset)) {
 		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
 				"inconsistent log page data offset.");
-		return FALSE;
+		return false;
 	}
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
@@ -259,7 +259,7 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
  * @rp:		restart page whose log client array to check
  *
  * Check the log client array of the restart page @rp for consistency and
- * return TRUE if it is consistent and FALSE otherwise.
+ * return 'true' if it is consistent and 'false' otherwise.
  *
  * This function assumes that the restart page header and the restart area have
  * already been consistency checked.
@@ -268,13 +268,13 @@ static BOOL ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
  * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
  * restart page and the page must be multi sector transfer deprotected.
  */
-static BOOL ntfs_check_log_client_array(struct inode *vi,
+static bool ntfs_check_log_client_array(struct inode *vi,
 		RESTART_PAGE_HEADER *rp)
 {
 	RESTART_AREA *ra;
 	LOG_CLIENT_RECORD *ca, *cr;
 	u16 nr_clients, idx;
-	BOOL in_free_list, idx_is_first;
+	bool in_free_list, idx_is_first;
 
 	ntfs_debug("Entering.");
 	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
@@ -290,9 +290,9 @@ static BOOL ntfs_check_log_client_array(struct inode *vi,
 	 */
 	nr_clients = le16_to_cpu(ra->log_clients);
 	idx = le16_to_cpu(ra->client_free_list);
-	in_free_list = TRUE;
+	in_free_list = true;
 check_list:
-	for (idx_is_first = TRUE; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
+	for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
 			idx = le16_to_cpu(cr->next_client)) {
 		if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
 			goto err_out;
@@ -302,20 +302,20 @@ check_list:
 		if (idx_is_first) {
 			if (cr->prev_client != LOGFILE_NO_CLIENT)
 				goto err_out;
-			idx_is_first = FALSE;
+			idx_is_first = false;
 		}
 	}
 	/* Switch to and check the in use list if we just did the free list. */
 	if (in_free_list) {
-		in_free_list = FALSE;
+		in_free_list = false;
 		idx = le16_to_cpu(ra->client_in_use_list);
 		goto check_list;
 	}
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 err_out:
 	ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
-	return FALSE;
+	return false;
 }
 
 /**
@@ -468,8 +468,8 @@ err_out:
  * @log_vi:	struct inode of loaded journal $LogFile to check
  * @rp:		[OUT] on success this is a copy of the current restart page
  *
- * Check the $LogFile journal for consistency and return TRUE if it is
- * consistent and FALSE if not.  On success, the current restart page is
+ * Check the $LogFile journal for consistency and return 'true' if it is
+ * consistent and 'false' if not.  On success, the current restart page is
  * returned in *@rp.  Caller must call ntfs_free(*@rp) when finished with it.
  *
  * At present we only check the two restart pages and ignore the log record
@@ -480,7 +480,7 @@ err_out:
  * if the $LogFile was created on a system with a different page size to ours
  * yet and mst deprotection would fail if our page size is smaller.
  */
-BOOL ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
+bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 {
 	s64 size, pos;
 	LSN rstr1_lsn, rstr2_lsn;
@@ -491,7 +491,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 	RESTART_PAGE_HEADER *rstr1_ph = NULL;
 	RESTART_PAGE_HEADER *rstr2_ph = NULL;
 	int log_page_size, log_page_mask, err;
-	BOOL logfile_is_empty = TRUE;
+	bool logfile_is_empty = true;
 	u8 log_page_bits;
 
 	ntfs_debug("Entering.");
@@ -527,7 +527,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 	if (size < log_page_size * 2 || (size - log_page_size * 2) >>
 			log_page_bits < MinLogRecordPages) {
 		ntfs_error(vol->sb, "$LogFile is too small.");
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Read through the file looking for a restart page.  Since the restart
@@ -556,7 +556,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 		 * means we are done.
 		 */
 		if (!ntfs_is_empty_recordp((le32*)kaddr))
-			logfile_is_empty = FALSE;
+			logfile_is_empty = false;
 		else if (!logfile_is_empty)
 			break;
 		/*
@@ -615,13 +615,13 @@ BOOL ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 		NVolSetLogFileEmpty(vol);
 is_empty:
 		ntfs_debug("Done.  ($LogFile is empty.)");
-		return TRUE;
+		return true;
 	}
 	if (!rstr1_ph) {
 		BUG_ON(rstr2_ph);
 		ntfs_error(vol->sb, "Did not find any restart pages in "
 				"$LogFile and it was not empty.");
-		return FALSE;
+		return false;
 	}
 	/* If both restart pages were found, use the more recent one. */
 	if (rstr2_ph) {
@@ -648,11 +648,11 @@ is_empty:
 	else
 		ntfs_free(rstr1_ph);
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 err_out:
 	if (rstr1_ph)
 		ntfs_free(rstr1_ph);
-	return FALSE;
+	return false;
 }
 
 /**
@@ -660,8 +660,8 @@ err_out:
  * @log_vi:	struct inode of loaded journal $LogFile to check
  * @rp:		copy of the current restart page
  *
- * Analyze the $LogFile journal and return TRUE if it indicates the volume was
- * shutdown cleanly and FALSE if not.
+ * Analyze the $LogFile journal and return 'true' if it indicates the volume was
+ * shutdown cleanly and 'false' if not.
  *
  * At present we only look at the two restart pages and ignore the log record
  * pages.  This is a little bit crude in that there will be a very small number
@@ -675,7 +675,7 @@ err_out:
  * is empty this function requires that NVolLogFileEmpty() is true otherwise an
  * empty volume will be reported as dirty.
  */
-BOOL ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
+bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
 {
 	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
 	RESTART_AREA *ra;
@@ -684,7 +684,7 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
 	/* An empty $LogFile must have been clean before it got emptied. */
 	if (NVolLogFileEmpty(vol)) {
 		ntfs_debug("Done.  ($LogFile is empty.)");
-		return TRUE;
+		return true;
 	}
 	BUG_ON(!rp);
 	if (!ntfs_is_rstr_record(rp->magic) &&
@@ -693,7 +693,7 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
 				"probably a bug in that the $LogFile should "
 				"have been consistency checked before calling "
 				"this function.");
-		return FALSE;
+		return false;
 	}
 	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
 	/*
@@ -704,25 +704,25 @@ BOOL ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
 	if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
 			!(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
 		ntfs_debug("Done.  $LogFile indicates a dirty shutdown.");
-		return FALSE;
+		return false;
 	}
 	/* $LogFile indicates a clean shutdown. */
 	ntfs_debug("Done.  $LogFile indicates a clean shutdown.");
-	return TRUE;
+	return true;
 }
 
 /**
  * ntfs_empty_logfile - empty the contents of the $LogFile journal
  * @log_vi:	struct inode of loaded journal $LogFile to empty
  *
- * Empty the contents of the $LogFile journal @log_vi and return TRUE on
- * success and FALSE on error.
+ * Empty the contents of the $LogFile journal @log_vi and return 'true' on
+ * success and 'false' on error.
  *
  * This function assumes that the $LogFile journal has already been consistency
  * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean()
  * has been used to ensure that the $LogFile is clean.
  */
-BOOL ntfs_empty_logfile(struct inode *log_vi)
+bool ntfs_empty_logfile(struct inode *log_vi)
 {
 	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
 
@@ -735,13 +735,13 @@ BOOL ntfs_empty_logfile(struct inode *log_vi)
 		if (unlikely(err)) {
 			ntfs_error(vol->sb, "Failed to fill $LogFile with "
 					"0xff bytes (error code %i).", err);
-			return FALSE;
+			return false;
 		}
 		/* Set the flag so we do not have to do it again on remount. */
 		NVolSetLogFileEmpty(vol);
 	}
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 #endif /* NTFS_RW */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index a51f3dd0e9eb..9468e1c45ae3 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -296,13 +296,13 @@ typedef struct {
 /* sizeof() = 160 (0xa0) bytes */
 } __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
 
-extern BOOL ntfs_check_logfile(struct inode *log_vi,
+extern bool ntfs_check_logfile(struct inode *log_vi,
 		RESTART_PAGE_HEADER **rp);
 
-extern BOOL ntfs_is_logfile_clean(struct inode *log_vi,
+extern bool ntfs_is_logfile_clean(struct inode *log_vi,
 		const RESTART_PAGE_HEADER *rp);
 
-extern BOOL ntfs_empty_logfile(struct inode *log_vi);
+extern bool ntfs_empty_logfile(struct inode *log_vi);
 
 #endif /* NTFS_RW */
 
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 2438c00ec0ce..2ad5c8b104b9 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -251,7 +251,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
 	int i;
 	unsigned long mft_no = MREF(mref);
 	u16 seq_no = MSEQNO(mref);
-	BOOL destroy_ni = FALSE;
+	bool destroy_ni = false;
 
 	ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
 			mft_no, base_ni->mft_no);
@@ -322,7 +322,7 @@ map_err_out:
 	if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
 		ntfs_error(base_ni->vol->sb, "Found stale extent mft "
 				"reference! Corrupt filesystem. Run chkdsk.");
-		destroy_ni = TRUE;
+		destroy_ni = true;
 		m = ERR_PTR(-EIO);
 		goto unm_err_out;
 	}
@@ -331,11 +331,11 @@ map_err_out:
 		ntfs_inode **tmp;
 		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
 
-		tmp = (ntfs_inode **)kmalloc(new_size, GFP_NOFS);
+		tmp = kmalloc(new_size, GFP_NOFS);
 		if (unlikely(!tmp)) {
 			ntfs_error(base_ni->vol->sb, "Failed to allocate "
 					"internal buffer.");
-			destroy_ni = TRUE;
+			destroy_ni = true;
 			m = ERR_PTR(-ENOMEM);
 			goto unm_err_out;
 		}
@@ -857,7 +857,7 @@ err_out:
  * caller is responsible for unlocking the ntfs inode and unpinning the base
  * vfs inode.
  *
- * Return TRUE if the mft record may be written out and FALSE if not.
+ * Return 'true' if the mft record may be written out and 'false' if not.
  *
  * The caller has locked the page and cleared the uptodate flag on it which
  * means that we can safely write out any dirty mft records that do not have
@@ -868,7 +868,7 @@ err_out:
  * Here is a description of the tests we perform:
  *
  * If the inode is found in icache we know the mft record must be a base mft
- * record.  If it is dirty, we do not write it and return FALSE as the vfs
+ * record.  If it is dirty, we do not write it and return 'false' as the vfs
  * inode write paths will result in the access times being updated which would
  * cause the base mft record to be redirtied and written out again.  (We know
  * the access time update will modify the base mft record because Windows
@@ -877,11 +877,11 @@ err_out:
  *
  * If the inode is in icache and not dirty, we attempt to lock the mft record
  * and if we find the lock was already taken, it is not safe to write the mft
- * record and we return FALSE.
+ * record and we return 'false'.
  *
  * If we manage to obtain the lock we have exclusive access to the mft record,
  * which also allows us safe writeout of the mft record.  We then set
- * @locked_ni to the locked ntfs inode and return TRUE.
+ * @locked_ni to the locked ntfs inode and return 'true'.
  *
  * Note we cannot just lock the mft record and sleep while waiting for the lock
  * because this would deadlock due to lock reversal (normally the mft record is
@@ -891,24 +891,24 @@ err_out:
  * If the inode is not in icache we need to perform further checks.
  *
  * If the mft record is not a FILE record or it is a base mft record, we can
- * safely write it and return TRUE.
+ * safely write it and return 'true'.
  *
  * We now know the mft record is an extent mft record.  We check if the inode
  * corresponding to its base mft record is in icache and obtain a reference to
- * it if it is.  If it is not, we can safely write it and return TRUE.
+ * it if it is.  If it is not, we can safely write it and return 'true'.
  *
  * We now have the base inode for the extent mft record.  We check if it has an
  * ntfs inode for the extent mft record attached and if not it is safe to write
- * the extent mft record and we return TRUE.
+ * the extent mft record and we return 'true'.
  *
  * The ntfs inode for the extent mft record is attached to the base inode so we
  * attempt to lock the extent mft record and if we find the lock was already
- * taken, it is not safe to write the extent mft record and we return FALSE.
+ * taken, it is not safe to write the extent mft record and we return 'false'.
  *
  * If we manage to obtain the lock we have exclusive access to the extent mft
  * record, which also allows us safe writeout of the extent mft record.  We
  * set the ntfs inode of the extent mft record clean and then set @locked_ni to
- * the now locked ntfs inode and return TRUE.
+ * the now locked ntfs inode and return 'true'.
  *
  * Note, the reason for actually writing dirty mft records here and not just
  * relying on the vfs inode dirty code paths is that we can have mft records
@@ -922,7 +922,7 @@ err_out:
  * appear if the mft record is reused for a new inode before it got written
  * out.
  */
-BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
+bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 		const MFT_RECORD *m, ntfs_inode **locked_ni)
 {
 	struct super_block *sb = vol->sb;
@@ -977,7 +977,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 					mft_no);
 			atomic_dec(&ni->count);
 			iput(vi);
-			return FALSE;
+			return false;
 		}
 		ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
 		/* The inode is not dirty, try to take the mft record lock. */
@@ -986,7 +986,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 					"not write it.", mft_no);
 			atomic_dec(&ni->count);
 			iput(vi);
-			return FALSE;
+			return false;
 		}
 		ntfs_debug("Managed to lock mft record 0x%lx, write it.",
 				mft_no);
@@ -995,7 +995,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 		 * return the locked ntfs inode.
 		 */
 		*locked_ni = ni;
-		return TRUE;
+		return true;
 	}
 	ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
 	/* The inode is not in icache. */
@@ -1003,13 +1003,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	if (!ntfs_is_mft_record(m->magic)) {
 		ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
 				mft_no);
-		return TRUE;
+		return true;
 	}
 	/* Write the mft record if it is a base inode. */
 	if (!m->base_mft_record) {
 		ntfs_debug("Mft record 0x%lx is a base record, write it.",
 				mft_no);
-		return TRUE;
+		return true;
 	}
 	/*
 	 * This is an extent mft record.  Check if the inode corresponding to
@@ -1033,7 +1033,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 		 */
 		ntfs_debug("Base inode 0x%lx is not in icache, write the "
 				"extent record.", na.mft_no);
-		return TRUE;
+		return true;
 	}
 	ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
 	/*
@@ -1051,7 +1051,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 		iput(vi);
 		ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
 				"write the extent record.", na.mft_no);
-		return TRUE;
+		return true;
 	}
 	/* Iterate over the attached extent inodes. */
 	extent_nis = ni->ext.extent_ntfs_inos;
@@ -1075,7 +1075,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 		ntfs_debug("Extent inode 0x%lx is not attached to its base "
 				"inode 0x%lx, write the extent record.",
 				mft_no, na.mft_no);
-		return TRUE;
+		return true;
 	}
 	ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
 			mft_no, na.mft_no);
@@ -1091,7 +1091,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 		iput(vi);
 		ntfs_debug("Extent mft record 0x%lx is already locked, do "
 				"not write it.", mft_no);
-		return FALSE;
+		return false;
 	}
 	ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
 			mft_no);
@@ -1103,7 +1103,7 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	 * the locked extent ntfs inode.
 	 */
 	*locked_ni = eni;
-	return TRUE;
+	return true;
 }
 
 static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
@@ -1354,7 +1354,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 		ntfs_unmap_page(page);
 		/* Allocate a cluster from the DATA_ZONE. */
 		rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
-				TRUE);
+				true);
 		if (IS_ERR(rl2)) {
 			up_write(&mftbmp_ni->runlist.lock);
 			ntfs_error(vol->sb, "Failed to allocate a cluster for "
@@ -1724,7 +1724,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	ATTR_RECORD *a = NULL;
 	int ret, mp_size;
 	u32 old_alen = 0;
-	BOOL mp_rebuilt = FALSE;
+	bool mp_rebuilt = false;
 
 	ntfs_debug("Extending mft data allocation.");
 	mft_ni = NTFS_I(vol->mft_ino);
@@ -1780,7 +1780,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	old_last_vcn = rl[1].vcn;
 	do {
 		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
-				TRUE);
+				true);
 		if (likely(!IS_ERR(rl2)))
 			break;
 		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
@@ -1884,7 +1884,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 		ret = -EOPNOTSUPP;
 		goto undo_alloc;
 	}
-	mp_rebuilt = TRUE;
+	mp_rebuilt = true;
 	/* Generate the mapping pairs array directly into the attr record. */
 	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
@@ -2255,7 +2255,7 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 	unsigned int ofs;
 	int err;
 	le16 seq_no, usn;
-	BOOL record_formatted = FALSE;
+	bool record_formatted = false;
 
 	if (base_ni) {
 		ntfs_debug("Entering (allocating an extent mft record for "
@@ -2454,7 +2454,7 @@ have_alloc_rec:
 		mft_ni->initialized_size = new_initialized_size;
 	}
 	write_unlock_irqrestore(&mft_ni->size_lock, flags);
-	record_formatted = TRUE;
+	record_formatted = true;
 	/* Update the mft data attribute record to reflect the new sizes. */
 	m = map_mft_record(mft_ni);
 	if (IS_ERR(m)) {
@@ -2638,11 +2638,6 @@ mft_rec_already_initialized:
 		}
 		vi->i_ino = bit;
 		/*
-		 * This is the optimal IO size (for stat), not the fs block
-		 * size.
-		 */
-		vi->i_blksize = PAGE_CACHE_SIZE;
-		/*
 		 * This is for checking whether an inode has changed w.r.t. a
 		 * file so that the file can be updated if necessary (compare
 		 * with f_version).
@@ -2893,7 +2888,7 @@ rollback:
 	if (!(base_ni->nr_extents & 3)) {
 		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
 
-		extent_nis = (ntfs_inode**)kmalloc(new_size, GFP_NOFS);
+		extent_nis = kmalloc(new_size, GFP_NOFS);
 		if (unlikely(!extent_nis)) {
 			ntfs_error(vol->sb, "Failed to allocate internal "
 					"buffer during rollback.%s", es);
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
index 639cd1bab08b..b52bf87b99de 100644
--- a/fs/ntfs/mft.h
+++ b/fs/ntfs/mft.h
@@ -111,7 +111,7 @@ static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
 	return err;
 }
 
-extern BOOL ntfs_may_write_mft_record(ntfs_volume *vol,
+extern bool ntfs_may_write_mft_record(ntfs_volume *vol,
 		const unsigned long mft_no, const MFT_RECORD *m,
 		ntfs_inode **locked_ni);
 
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index ddd3d503097c..a12847ae467d 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -105,7 +105,7 @@ extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
 extern void post_write_mst_fixup(NTFS_RECORD *b);
 
 /* From fs/ntfs/unistr.c */
-extern BOOL ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
 		const ntfschar *s2, size_t s2_len,
 		const IGNORE_CASE_BOOL ic,
 		const ntfschar *upcase, const u32 upcase_size);
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
index d0ef4182147b..d80e3315cab0 100644
--- a/fs/ntfs/quota.c
+++ b/fs/ntfs/quota.c
@@ -31,10 +31,10 @@
  * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
  * @vol:	ntfs volume on which to mark the quotas out of date
  *
- * Mark the quotas out of date on the ntfs volume @vol and return TRUE on
- * success and FALSE on error.
+ * Mark the quotas out of date on the ntfs volume @vol and return 'true' on
+ * success and 'false' on error.
  */
-BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
+bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 {
 	ntfs_index_context *ictx;
 	QUOTA_CONTROL_ENTRY *qce;
@@ -46,7 +46,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 		goto done;
 	if (!vol->quota_ino || !vol->quota_q_ino) {
 		ntfs_error(vol->sb, "Quota inodes are not open.");
-		return FALSE;
+		return false;
 	}
 	mutex_lock(&vol->quota_q_ino->i_mutex);
 	ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
@@ -106,12 +106,12 @@ set_done:
 	NVolSetQuotaOutOfDate(vol);
 done:
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 err_out:
 	if (ictx)
 		ntfs_index_ctx_put(ictx);
 	mutex_unlock(&vol->quota_q_ino->i_mutex);
-	return FALSE;
+	return false;
 }
 
 #endif /* NTFS_RW */
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
index 40e4763aa222..4cbe5594c0b0 100644
--- a/fs/ntfs/quota.h
+++ b/fs/ntfs/quota.h
@@ -28,7 +28,7 @@
 #include "types.h"
 #include "volume.h"
 
-extern BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
+extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
 
 #endif /* NTFS_RW */
 
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index eb52b801512b..9afd72c7ad0d 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -149,10 +149,10 @@ static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
  *
  * It is up to the caller to serialize access to the runlists @dst and @src.
  *
- * Return: TRUE   Success, the runlists can be merged.
- *	   FALSE  Failure, the runlists cannot be merged.
+ * Return: true   Success, the runlists can be merged.
+ *	   false  Failure, the runlists cannot be merged.
  */
-static inline BOOL ntfs_are_rl_mergeable(runlist_element *dst,
+static inline bool ntfs_are_rl_mergeable(runlist_element *dst,
 		runlist_element *src)
 {
 	BUG_ON(!dst);
@@ -160,19 +160,19 @@ static inline BOOL ntfs_are_rl_mergeable(runlist_element *dst,
 
 	/* We can merge unmapped regions even if they are misaligned. */
 	if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED))
-		return TRUE;
+		return true;
 	/* If the runs are misaligned, we cannot merge them. */
 	if ((dst->vcn + dst->length) != src->vcn)
-		return FALSE;
+		return false;
 	/* If both runs are non-sparse and contiguous, we can merge them. */
 	if ((dst->lcn >= 0) && (src->lcn >= 0) &&
 			((dst->lcn + dst->length) == src->lcn))
-		return TRUE;
+		return true;
 	/* If we are merging two holes, we can merge them. */
 	if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE))
-		return TRUE;
+		return true;
 	/* Cannot merge. */
-	return FALSE;
+	return false;
 }
 
 /**
@@ -218,7 +218,7 @@ static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
 static inline runlist_element *ntfs_rl_append(runlist_element *dst,
 		int dsize, runlist_element *src, int ssize, int loc)
 {
-	BOOL right = FALSE;	/* Right end of @src needs merging. */
+	bool right = false;	/* Right end of @src needs merging. */
 	int marker;		/* End of the inserted runs. */
 
 	BUG_ON(!dst);
@@ -285,8 +285,8 @@ static inline runlist_element *ntfs_rl_append(runlist_element *dst,
 static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
 		int dsize, runlist_element *src, int ssize, int loc)
 {
-	BOOL left = FALSE;	/* Left end of @src needs merging. */
-	BOOL disc = FALSE;	/* Discontinuity between @dst and @src. */
+	bool left = false;	/* Left end of @src needs merging. */
+	bool disc = false;	/* Discontinuity between @dst and @src. */
 	int marker;		/* End of the inserted runs. */
 
 	BUG_ON(!dst);
@@ -382,8 +382,8 @@ static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
 		int dsize, runlist_element *src, int ssize, int loc)
 {
 	signed delta;
-	BOOL left = FALSE;	/* Left end of @src needs merging. */
-	BOOL right = FALSE;	/* Right end of @src needs merging. */
+	bool left = false;	/* Left end of @src needs merging. */
+	bool right = false;	/* Right end of @src needs merging. */
 	int tail;		/* Start of tail of @dst. */
 	int marker;		/* End of the inserted runs. */
 
@@ -620,8 +620,8 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 		;
 
 	{
-	BOOL start;
-	BOOL finish;
+	bool start;
+	bool finish;
 	int ds = dend + 1;		/* Number of elements in drl & srl */
 	int ss = sfinal - sstart + 1;
 
@@ -635,7 +635,7 @@ runlist_element *ntfs_runlists_merge(runlist_element *drl,
 	if (finish && !drl[dins].length)
 		ss++;
 	if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
-		finish = FALSE;
+		finish = false;
 #if 0
 	ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
 	ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
@@ -1134,7 +1134,7 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
 {
 	LCN prev_lcn;
 	int rls;
-	BOOL the_end = FALSE;
+	bool the_end = false;
 
 	BUG_ON(first_vcn < 0);
 	BUG_ON(last_vcn < -1);
@@ -1168,7 +1168,7 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
 			s64 s1 = last_vcn + 1;
 			if (unlikely(rl[1].vcn > s1))
 				length = s1 - rl->vcn;
-			the_end = TRUE;
+			the_end = true;
 		}
 		delta = first_vcn - rl->vcn;
 		/* Header byte + length. */
@@ -1204,7 +1204,7 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
 			s64 s1 = last_vcn + 1;
 			if (unlikely(rl[1].vcn > s1))
 				length = s1 - rl->vcn;
-			the_end = TRUE;
+			the_end = true;
 		}
 		/* Header byte + length. */
 		rls += 1 + ntfs_get_nr_significant_bytes(length);
@@ -1327,7 +1327,7 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 	LCN prev_lcn;
 	s8 *dst_max, *dst_next;
 	int err = -ENOSPC;
-	BOOL the_end = FALSE;
+	bool the_end = false;
 	s8 len_len, lcn_len;
 
 	BUG_ON(first_vcn < 0);
@@ -1370,7 +1370,7 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 			s64 s1 = last_vcn + 1;
 			if (unlikely(rl[1].vcn > s1))
 				length = s1 - rl->vcn;
-			the_end = TRUE;
+			the_end = true;
 		}
 		delta = first_vcn - rl->vcn;
 		/* Write length. */
@@ -1422,7 +1422,7 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 			s64 s1 = last_vcn + 1;
 			if (unlikely(rl[1].vcn > s1))
 				length = s1 - rl->vcn;
-			the_end = TRUE;
+			the_end = true;
 		}
 		/* Write length. */
 		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
@@ -1541,7 +1541,7 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 	 */
 	if (rl->length) {
 		runlist_element *trl;
-		BOOL is_end;
+		bool is_end;
 
 		ntfs_debug("Shrinking runlist.");
 		/* Determine the runlist size. */
@@ -1555,11 +1555,11 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
 		 * If a run was partially truncated, make the following runlist
 		 * element a terminator.
 		 */
-		is_end = FALSE;
+		is_end = false;
 		if (rl->length) {
 			rl++;
 			if (!rl->length)
-				is_end = TRUE;
+				is_end = true;
 			rl->vcn = new_length;
 			rl->length = 0;
 		}
@@ -1648,7 +1648,7 @@ int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
 	s64 delta;
 	runlist_element *rl, *rl_end, *rl_real_end, *trl;
 	int old_size;
-	BOOL lcn_fixup = FALSE;
+	bool lcn_fixup = false;
 
 	ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
 			(long long)start, (long long)length);
@@ -1862,7 +1862,7 @@ split_end:
 		if (rl->lcn >= 0) {
 			rl->lcn -= delta;
 			/* Need this in case the lcn just became negative. */
-			lcn_fixup = TRUE;
+			lcn_fixup = true;
 		}
 		rl->length += delta;
 		goto split_end;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 74e0ee8fce72..03a391ac7145 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -74,18 +74,18 @@ const option_t on_errors_arr[] = {
  *
  * Copied from old ntfs driver (which copied from vfat driver).
  */
-static int simple_getbool(char *s, BOOL *setval)
+static int simple_getbool(char *s, bool *setval)
 {
 	if (s) {
 		if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
-			*setval = TRUE;
+			*setval = true;
 		else if (!strcmp(s, "0") || !strcmp(s, "no") ||
 							!strcmp(s, "false"))
-			*setval = FALSE;
+			*setval = false;
 		else
 			return 0;
 	} else
-		*setval = TRUE;
+		*setval = true;
 	return 1;
 }
 
@@ -96,7 +96,7 @@ static int simple_getbool(char *s, BOOL *setval)
  *
  * Parse the recognized options in @opt for the ntfs volume described by @vol.
  */
-static BOOL parse_options(ntfs_volume *vol, char *opt)
+static bool parse_options(ntfs_volume *vol, char *opt)
 {
 	char *p, *v, *ov;
 	static char *utf8 = "utf8";
@@ -137,7 +137,7 @@ static BOOL parse_options(ntfs_volume *vol, char *opt)
 	}
 #define NTFS_GETOPT_BOOL(option, variable)				\
 	if (!strcmp(p, option)) {					\
-		BOOL val;						\
+		bool val;						\
 		if (!simple_getbool(v, &val))				\
 			goto needs_bool;				\
 		variable = val;						\
@@ -170,7 +170,7 @@ static BOOL parse_options(ntfs_volume *vol, char *opt)
 		else NTFS_GETOPT_OCTAL("fmask", fmask)
 		else NTFS_GETOPT_OCTAL("dmask", dmask)
 		else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
-		else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, TRUE)
+		else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true)
 		else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
 		else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
 		else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse)
@@ -194,7 +194,7 @@ use_utf8:
 				if (!old_nls) {
 					ntfs_error(vol->sb, "NLS character set "
 							"%s not found.", v);
-					return FALSE;
+					return false;
 				}
 				ntfs_error(vol->sb, "NLS character set %s not "
 						"found. Using previous one %s.",
@@ -205,14 +205,14 @@ use_utf8:
 					unload_nls(old_nls);
 			}
 		} else if (!strcmp(p, "utf8")) {
-			BOOL val = FALSE;
+			bool val = false;
 			ntfs_warning(vol->sb, "Option utf8 is no longer "
 				   "supported, using option nls=utf8. Please "
 				   "use option nls=utf8 in the future and "
 				   "make sure utf8 is compiled either as a "
 				   "module or into the kernel.");
 			if (!v || !*v)
-				val = TRUE;
+				val = true;
 			else if (!simple_getbool(v, &val))
 				goto needs_bool;
 			if (val) {
@@ -231,7 +231,7 @@ use_utf8:
 	}
 no_mount_options:
 	if (errors && !sloppy)
-		return FALSE;
+		return false;
 	if (sloppy)
 		ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
 				"unrecognized mount option(s) and continuing.");
@@ -240,14 +240,14 @@ no_mount_options:
 		if (!on_errors) {
 			ntfs_error(vol->sb, "Invalid errors option argument "
 					"or bug in options parser.");
-			return FALSE;
+			return false;
 		}
 	}
 	if (nls_map) {
 		if (vol->nls_map && vol->nls_map != nls_map) {
 			ntfs_error(vol->sb, "Cannot change NLS character set "
 					"on remount.");
-			return FALSE;
+			return false;
 		} /* else (!vol->nls_map) */
 		ntfs_debug("Using NLS character set %s.", nls_map->charset);
 		vol->nls_map = nls_map;
@@ -257,7 +257,7 @@ no_mount_options:
 			if (!vol->nls_map) {
 				ntfs_error(vol->sb, "Failed to load default "
 						"NLS character set.");
-				return FALSE;
+				return false;
 			}
 			ntfs_debug("Using default NLS character set (%s).",
 					vol->nls_map->charset);
@@ -268,7 +268,7 @@ no_mount_options:
 				mft_zone_multiplier) {
 			ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
 					"on remount.");
-			return FALSE;
+			return false;
 		}
 		if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
 			ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
@@ -318,16 +318,16 @@ no_mount_options:
 				NVolSetSparseEnabled(vol);
 		}
 	}
-	return TRUE;
+	return true;
 needs_arg:
 	ntfs_error(vol->sb, "The %s option requires an argument.", p);
-	return FALSE;
+	return false;
 needs_bool:
 	ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
-	return FALSE;
+	return false;
 needs_val:
 	ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
-	return FALSE;
+	return false;
 }
 
 #ifdef NTFS_RW
@@ -543,16 +543,16 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
  * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
  * @sb:		Super block of the device to which @b belongs.
  * @b:		Boot sector of device @sb to check.
- * @silent:	If TRUE, all output will be silenced.
+ * @silent:	If 'true', all output will be silenced.
  *
  * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
- * sector. Returns TRUE if it is valid and FALSE if not.
+ * sector. Returns 'true' if it is valid and 'false' if not.
  *
  * @sb is only needed for warning/error output, i.e. it can be NULL when silent
- * is TRUE.
+ * is 'true'.
  */
-static BOOL is_boot_sector_ntfs(const struct super_block *sb,
-		const NTFS_BOOT_SECTOR *b, const BOOL silent)
+static bool is_boot_sector_ntfs(const struct super_block *sb,
+		const NTFS_BOOT_SECTOR *b, const bool silent)
 {
 	/*
 	 * Check that checksum == sum of u32 values from b to the checksum
@@ -620,9 +620,9 @@ static BOOL is_boot_sector_ntfs(const struct super_block *sb,
 	 */
 	if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55))
 		ntfs_warning(sb, "Invalid end of sector marker.");
-	return TRUE;
+	return true;
 not_ntfs:
-	return FALSE;
+	return false;
 }
 
 /**
@@ -732,9 +732,9 @@ hotfix_primary_boot_sector:
  * @b:		boot sector to parse
  *
  * Parse the ntfs boot sector @b and store all imporant information therein in
- * the ntfs super block @vol.  Return TRUE on success and FALSE on error.
+ * the ntfs super block @vol.  Return 'true' on success and 'false' on error.
  */
-static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
+static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 {
 	unsigned int sectors_per_cluster_bits, nr_hidden_sects;
 	int clusters_per_mft_record, clusters_per_index_record;
@@ -751,7 +751,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 				"device block size (%lu).  This is not "
 				"supported.  Sorry.", vol->sector_size,
 				vol->sb->s_blocksize);
-		return FALSE;
+		return false;
 	}
 	ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
 	sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
@@ -770,7 +770,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 		ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
 				"sector size (%i).  This is not supported.  "
 				"Sorry.", vol->cluster_size, vol->sector_size);
-		return FALSE;
+		return false;
 	}
 	clusters_per_mft_record = b->clusters_per_mft_record;
 	ntfs_debug("clusters_per_mft_record = %i (0x%x)",
@@ -802,7 +802,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 				"PAGE_CACHE_SIZE on your system (%lu).  "
 				"This is not supported.  Sorry.",
 				vol->mft_record_size, PAGE_CACHE_SIZE);
-		return FALSE;
+		return false;
 	}
 	/* We cannot support mft record sizes below the sector size. */
 	if (vol->mft_record_size < vol->sector_size) {
@@ -810,7 +810,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 				"sector size (%i).  This is not supported.  "
 				"Sorry.", vol->mft_record_size,
 				vol->sector_size);
-		return FALSE;
+		return false;
 	}
 	clusters_per_index_record = b->clusters_per_index_record;
 	ntfs_debug("clusters_per_index_record = %i (0x%x)",
@@ -841,7 +841,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 				"the sector size (%i).  This is not "
 				"supported.  Sorry.", vol->index_record_size,
 				vol->sector_size);
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Get the size of the volume in clusters and check for 64-bit-ness.
@@ -851,7 +851,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
 	if ((u64)ll >= 1ULL << 32) {
 		ntfs_error(vol->sb, "Cannot handle 64-bit clusters.  Sorry.");
-		return FALSE;
+		return false;
 	}
 	vol->nr_clusters = ll;
 	ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
@@ -867,7 +867,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 					"Maximum supported is 2TiB.  Sorry.",
 					(unsigned long long)ll >> (40 -
 					vol->cluster_size_bits));
-			return FALSE;
+			return false;
 		}
 	}
 	ll = sle64_to_cpu(b->mft_lcn);
@@ -875,7 +875,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 		ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
 				"volume.  Weird.", (unsigned long long)ll,
 				(unsigned long long)ll);
-		return FALSE;
+		return false;
 	}
 	vol->mft_lcn = ll;
 	ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
@@ -884,7 +884,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 		ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
 				"of volume.  Weird.", (unsigned long long)ll,
 				(unsigned long long)ll);
-		return FALSE;
+		return false;
 	}
 	vol->mftmirr_lcn = ll;
 	ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
@@ -907,7 +907,7 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	vol->serial_no = le64_to_cpu(b->volume_serial_number);
 	ntfs_debug("vol->serial_no = 0x%llx",
 			(unsigned long long)vol->serial_no);
-	return TRUE;
+	return true;
 }
 
 /**
@@ -1000,9 +1000,9 @@ static void ntfs_setup_allocators(ntfs_volume *vol)
  * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
  * @vol:	ntfs super block describing device whose mft mirror to load
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
+static bool load_and_init_mft_mirror(ntfs_volume *vol)
 {
 	struct inode *tmp_ino;
 	ntfs_inode *tmp_ni;
@@ -1014,7 +1014,7 @@ static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
 		if (!IS_ERR(tmp_ino))
 			iput(tmp_ino);
 		/* Caller will display error message. */
-		return FALSE;
+		return false;
 	}
 	/*
 	 * Re-initialize some specifics about $MFTMirr's inode as
@@ -1041,20 +1041,20 @@ static BOOL load_and_init_mft_mirror(ntfs_volume *vol)
 	tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits;
 	vol->mftmirr_ino = tmp_ino;
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
  * check_mft_mirror - compare contents of the mft mirror with the mft
  * @vol:	ntfs super block describing device whose mft mirror to check
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  *
  * Note, this function also results in the mft mirror runlist being completely
  * mapped into memory.  The mft mirror write code requires this and will BUG()
  * should it find an unmapped runlist element.
  */
-static BOOL check_mft_mirror(ntfs_volume *vol)
+static bool check_mft_mirror(ntfs_volume *vol)
 {
 	struct super_block *sb = vol->sb;
 	ntfs_inode *mirr_ni;
@@ -1086,7 +1086,7 @@ static BOOL check_mft_mirror(ntfs_volume *vol)
 					index);
 			if (IS_ERR(mft_page)) {
 				ntfs_error(sb, "Failed to read $MFT.");
-				return FALSE;
+				return false;
 			}
 			kmft = page_address(mft_page);
 			/* Get the $MFTMirr page. */
@@ -1110,7 +1110,7 @@ mm_unmap_out:
 				ntfs_unmap_page(mirr_page);
 mft_unmap_out:
 				ntfs_unmap_page(mft_page);
-				return FALSE;
+				return false;
 			}
 		}
 		/* Do not check the mirror record if it is not in use. */
@@ -1169,21 +1169,21 @@ mft_unmap_out:
 			ntfs_error(sb, "$MFTMirr location mismatch.  "
 					"Run chkdsk.");
 			up_read(&mirr_ni->runlist.lock);
-			return FALSE;
+			return false;
 		}
 	} while (rl2[i++].length);
 	up_read(&mirr_ni->runlist.lock);
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
  * load_and_check_logfile - load and check the logfile inode for a volume
  * @vol:	ntfs super block describing device whose logfile to load
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_and_check_logfile(ntfs_volume *vol,
+static bool load_and_check_logfile(ntfs_volume *vol,
 		RESTART_PAGE_HEADER **rp)
 {
 	struct inode *tmp_ino;
@@ -1194,17 +1194,17 @@ static BOOL load_and_check_logfile(ntfs_volume *vol,
 		if (!IS_ERR(tmp_ino))
 			iput(tmp_ino);
 		/* Caller will display error message. */
-		return FALSE;
+		return false;
 	}
 	if (!ntfs_check_logfile(tmp_ino, rp)) {
 		iput(tmp_ino);
 		/* ntfs_check_logfile() will have displayed error output. */
-		return FALSE;
+		return false;
 	}
 	NInoSetSparseDisabled(NTFS_I(tmp_ino));
 	vol->logfile_ino = tmp_ino;
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 #define NTFS_HIBERFIL_HEADER_SIZE	4096
@@ -1329,10 +1329,10 @@ iput_out:
  * load_and_init_quota - load and setup the quota file for a volume if present
  * @vol:	ntfs super block describing device whose quota file to load
  *
- * Return TRUE on success or FALSE on error.  If $Quota is not present, we
+ * Return 'true' on success or 'false' on error.  If $Quota is not present, we
  * leave vol->quota_ino as NULL and return success.
  */
-static BOOL load_and_init_quota(ntfs_volume *vol)
+static bool load_and_init_quota(ntfs_volume *vol)
 {
 	MFT_REF mref;
 	struct inode *tmp_ino;
@@ -1366,11 +1366,11 @@ static BOOL load_and_init_quota(ntfs_volume *vol)
 			 * not enabled.
 			 */
 			NVolSetQuotaOutOfDate(vol);
-			return TRUE;
+			return true;
 		}
 		/* A real error occured. */
 		ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
-		return FALSE;
+		return false;
 	}
 	/* We do not care for the type of match that was found. */
 	kfree(name);
@@ -1380,25 +1380,25 @@ static BOOL load_and_init_quota(ntfs_volume *vol)
 		if (!IS_ERR(tmp_ino))
 			iput(tmp_ino);
 		ntfs_error(vol->sb, "Failed to load $Quota.");
-		return FALSE;
+		return false;
 	}
 	vol->quota_ino = tmp_ino;
 	/* Get the $Q index allocation attribute. */
 	tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2);
 	if (IS_ERR(tmp_ino)) {
 		ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
-		return FALSE;
+		return false;
 	}
 	vol->quota_q_ino = tmp_ino;
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
  * load_and_init_usnjrnl - load and setup the transaction log if present
  * @vol:	ntfs super block describing device whose usnjrnl file to load
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  *
  * If $UsnJrnl is not present or in the process of being disabled, we set
  * NVolUsnJrnlStamped() and return success.
@@ -1408,7 +1408,7 @@ static BOOL load_and_init_quota(ntfs_volume *vol)
  * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped()
  * and return success.
  */
-static BOOL load_and_init_usnjrnl(ntfs_volume *vol)
+static bool load_and_init_usnjrnl(ntfs_volume *vol)
 {
 	MFT_REF mref;
 	struct inode *tmp_ino;
@@ -1450,12 +1450,12 @@ not_enabled:
 			 * transaction logging is not enabled.
 			 */
 			NVolSetUsnJrnlStamped(vol);
-			return TRUE;
+			return true;
 		}
 		/* A real error occured. */
 		ntfs_error(vol->sb, "Failed to find inode number for "
 				"$UsnJrnl.");
-		return FALSE;
+		return false;
 	}
 	/* We do not care for the type of match that was found. */
 	kfree(name);
@@ -1465,7 +1465,7 @@ not_enabled:
 		if (!IS_ERR(tmp_ino))
 			iput(tmp_ino);
 		ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
-		return FALSE;
+		return false;
 	}
 	vol->usnjrnl_ino = tmp_ino;
 	/*
@@ -1483,7 +1483,7 @@ not_enabled:
 	if (IS_ERR(tmp_ino)) {
 		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max "
 				"attribute.");
-		return FALSE;
+		return false;
 	}
 	vol->usnjrnl_max_ino = tmp_ino;
 	if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
@@ -1491,14 +1491,14 @@ not_enabled:
 				"attribute (size is 0x%llx but should be at "
 				"least 0x%zx bytes).", i_size_read(tmp_ino),
 				sizeof(USN_HEADER));
-		return FALSE;
+		return false;
 	}
 	/* Get the $DATA/$J attribute. */
 	tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2);
 	if (IS_ERR(tmp_ino)) {
 		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J "
 				"attribute.");
-		return FALSE;
+		return false;
 	}
 	vol->usnjrnl_j_ino = tmp_ino;
 	/* Verify $J is non-resident and sparse. */
@@ -1506,14 +1506,14 @@ not_enabled:
 	if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) {
 		ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident "
 				"and/or not sparse.");
-		return FALSE;
+		return false;
 	}
 	/* Read the USN_HEADER from $DATA/$Max. */
 	page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
 	if (IS_ERR(page)) {
 		ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max "
 				"attribute.");
-		return FALSE;
+		return false;
 	}
 	uh = (USN_HEADER*)page_address(page);
 	/* Sanity check the $Max. */
@@ -1524,7 +1524,7 @@ not_enabled:
 				(long long)sle64_to_cpu(uh->allocation_delta),
 				(long long)sle64_to_cpu(uh->maximum_size));
 		ntfs_unmap_page(page);
-		return FALSE;
+		return false;
 	}
 	/*
 	 * If the transaction log has been stamped and nothing has been written
@@ -1548,20 +1548,20 @@ not_enabled:
 				(long long)sle64_to_cpu(uh->lowest_valid_usn),
 				i_size_read(vol->usnjrnl_j_ino));
 		ntfs_unmap_page(page);
-		return FALSE;
+		return false;
 	}
 	ntfs_unmap_page(page);
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 /**
  * load_and_init_attrdef - load the attribute definitions table for a volume
  * @vol:	ntfs super block describing device whose attrdef to load
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_and_init_attrdef(ntfs_volume *vol)
+static bool load_and_init_attrdef(ntfs_volume *vol)
 {
 	loff_t i_size;
 	struct super_block *sb = vol->sb;
@@ -1607,7 +1607,7 @@ read_partial_attrdef_page:
 	vol->attrdef_size = i_size;
 	ntfs_debug("Read %llu bytes from $AttrDef.", i_size);
 	iput(ino);
-	return TRUE;
+	return true;
 free_iput_failed:
 	ntfs_free(vol->attrdef);
 	vol->attrdef = NULL;
@@ -1615,7 +1615,7 @@ iput_failed:
 	iput(ino);
 failed:
 	ntfs_error(sb, "Failed to initialize attribute definition table.");
-	return FALSE;
+	return false;
 }
 
 #endif /* NTFS_RW */
@@ -1624,9 +1624,9 @@ failed:
  * load_and_init_upcase - load the upcase table for an ntfs volume
  * @vol:	ntfs super block describing device whose upcase to load
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_and_init_upcase(ntfs_volume *vol)
+static bool load_and_init_upcase(ntfs_volume *vol)
 {
 	loff_t i_size;
 	struct super_block *sb = vol->sb;
@@ -1682,7 +1682,7 @@ read_partial_upcase_page:
 		ntfs_debug("Using volume specified $UpCase since default is "
 				"not present.");
 		mutex_unlock(&ntfs_lock);
-		return TRUE;
+		return true;
 	}
 	max = default_upcase_len;
 	if (max > vol->upcase_len)
@@ -1698,12 +1698,12 @@ read_partial_upcase_page:
 		mutex_unlock(&ntfs_lock);
 		ntfs_debug("Volume specified $UpCase matches default. Using "
 				"default.");
-		return TRUE;
+		return true;
 	}
 	mutex_unlock(&ntfs_lock);
 	ntfs_debug("Using volume specified $UpCase since it does not match "
 			"the default.");
-	return TRUE;
+	return true;
 iput_upcase_failed:
 	iput(ino);
 	ntfs_free(vol->upcase);
@@ -1717,11 +1717,11 @@ upcase_failed:
 		mutex_unlock(&ntfs_lock);
 		ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
 				"default.");
-		return TRUE;
+		return true;
 	}
 	mutex_unlock(&ntfs_lock);
 	ntfs_error(sb, "Failed to initialize upcase table.");
-	return FALSE;
+	return false;
 }
 
 /*
@@ -1739,9 +1739,9 @@ static struct lock_class_key
  * Open the system files with normal access functions and complete setting up
  * the ntfs super block @vol.
  *
- * Return TRUE on success or FALSE on error.
+ * Return 'true' on success or 'false' on error.
  */
-static BOOL load_system_files(ntfs_volume *vol)
+static bool load_system_files(ntfs_volume *vol)
 {
 	struct super_block *sb = vol->sb;
 	MFT_RECORD *m;
@@ -2067,7 +2067,7 @@ get_ctx_vol_failed:
 #endif /* NTFS_RW */
 	/* If on NTFS versions before 3.0, we are done. */
 	if (unlikely(vol->major_ver < 3))
-		return TRUE;
+		return true;
 	/* NTFS 3.0+ specific initialization. */
 	/* Get the security descriptors inode. */
 	vol->secure_ino = ntfs_iget(sb, FILE_Secure);
@@ -2173,7 +2173,7 @@ get_ctx_vol_failed:
 		NVolSetErrors(vol);
 	}
 #endif /* NTFS_RW */
-	return TRUE;
+	return true;
 #ifdef NTFS_RW
 iput_usnjrnl_err_out:
 	if (vol->usnjrnl_j_ino)
@@ -2229,7 +2229,7 @@ iput_mirr_err_out:
 	if (vol->mftmirr_ino)
 		iput(vol->mftmirr_ino);
 #endif /* NTFS_RW */
-	return FALSE;
+	return false;
 }
 
 /**
@@ -3248,32 +3248,14 @@ ictx_err_out:
 
 static void __exit exit_ntfs_fs(void)
 {
-	int err = 0;
-
 	ntfs_debug("Unregistering NTFS driver.");
 
 	unregister_filesystem(&ntfs_fs_type);
-
-	if (kmem_cache_destroy(ntfs_big_inode_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_big_inode_cache_name);
-	if (kmem_cache_destroy(ntfs_inode_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_inode_cache_name);
-	if (kmem_cache_destroy(ntfs_name_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_name_cache_name);
-	if (kmem_cache_destroy(ntfs_attr_ctx_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_attr_ctx_cache_name);
-	if (kmem_cache_destroy(ntfs_index_ctx_cache) && (err = 1))
-		printk(KERN_CRIT "NTFS: Failed to destory %s.\n",
-				ntfs_index_ctx_cache_name);
-	if (err)
-		printk(KERN_CRIT "NTFS: This causes memory to leak! There is "
-				"probably a BUG in the driver! Please report "
-				"you saw this message to "
-				"linux-ntfs-dev@lists.sourceforge.net\n");
+	kmem_cache_destroy(ntfs_big_inode_cache);
+	kmem_cache_destroy(ntfs_inode_cache);
+	kmem_cache_destroy(ntfs_name_cache);
+	kmem_cache_destroy(ntfs_attr_ctx_cache);
+	kmem_cache_destroy(ntfs_index_ctx_cache);
 	/* Unregister the ntfs sysctls. */
 	ntfs_sysctl(0);
 }
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
index 6e4a7e3343f2..8c8053b66984 100644
--- a/fs/ntfs/types.h
+++ b/fs/ntfs/types.h
@@ -62,11 +62,6 @@ typedef s64 USN;
 typedef sle64 leUSN;
 
 typedef enum {
-	FALSE = 0,
-	TRUE = 1
-} BOOL;
-
-typedef enum {
 	CASE_SENSITIVE = 0,
 	IGNORE_CASE = 1,
 } IGNORE_CASE_BOOL;
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
index b123c0fa6bf6..6a495f7369f9 100644
--- a/fs/ntfs/unistr.c
+++ b/fs/ntfs/unistr.c
@@ -61,16 +61,16 @@ static const u8 legal_ansi_char_array[0x40] = {
  * @upcase:		upcase table (only if @ic == IGNORE_CASE)
  * @upcase_size:	length in Unicode characters of @upcase (if present)
  *
- * Compare the names @s1 and @s2 and return TRUE (1) if the names are
- * identical, or FALSE (0) if they are not identical. If @ic is IGNORE_CASE,
+ * Compare the names @s1 and @s2 and return 'true' (1) if the names are
+ * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
  * the @upcase table is used to performa a case insensitive comparison.
  */
-BOOL ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
 		const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
 		const ntfschar *upcase, const u32 upcase_size)
 {
 	if (s1_len != s2_len)
-		return FALSE;
+		return false;
 	if (ic == CASE_SENSITIVE)
 		return !ntfs_ucsncmp(s1, s2, s1_len);
 	return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
@@ -350,7 +350,7 @@ int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
 		}
 		if (!ns) {
 			ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
-			ns = (unsigned char*)kmalloc(ns_len + 1, GFP_NOFS);
+			ns = kmalloc(ns_len + 1, GFP_NOFS);
 			if (!ns)
 				goto mem_err_out;
 		}
@@ -365,7 +365,7 @@ retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
 			else if (wc == -ENAMETOOLONG && ns != *outs) {
 				unsigned char *tc;
 				/* Grow in multiples of 64 bytes. */
-				tc = (unsigned char*)kmalloc((ns_len + 64) &
+				tc = kmalloc((ns_len + 64) &
 						~63, GFP_NOFS);
 				if (tc) {
 					memcpy(tc, ns, ns_len);
diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c
index 77773240d139..b2bc0d55b036 100644
--- a/fs/ntfs/usnjrnl.c
+++ b/fs/ntfs/usnjrnl.c
@@ -39,12 +39,12 @@
  * @vol:	ntfs volume on which to stamp the transaction log
  *
  * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return
- * TRUE on success and FALSE on error.
+ * 'true' on success and 'false' on error.
  *
  * This function assumes that the transaction log has already been loaded and
  * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl().
  */
-BOOL ntfs_stamp_usnjrnl(ntfs_volume *vol)
+bool ntfs_stamp_usnjrnl(ntfs_volume *vol)
 {
 	ntfs_debug("Entering.");
 	if (likely(!NVolUsnJrnlStamped(vol))) {
@@ -56,7 +56,7 @@ BOOL ntfs_stamp_usnjrnl(ntfs_volume *vol)
 		if (IS_ERR(page)) {
 			ntfs_error(vol->sb, "Failed to read from "
 					"$UsnJrnl/$DATA/$Max attribute.");
-			return FALSE;
+			return false;
 		}
 		uh = (USN_HEADER*)page_address(page);
 		stamp = get_current_ntfs_time();
@@ -78,7 +78,7 @@ BOOL ntfs_stamp_usnjrnl(ntfs_volume *vol)
 		NVolSetUsnJrnlStamped(vol);
 	}
 	ntfs_debug("Done.");
-	return TRUE;
+	return true;
 }
 
 #endif /* NTFS_RW */
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index ff988b0deb45..3a8af75351e8 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -198,7 +198,7 @@ typedef struct {
 /* sizeof() = 60 (0x3c) bytes */
 } __attribute__ ((__packed__)) USN_RECORD;
 
-extern BOOL ntfs_stamp_usnjrnl(ntfs_volume *vol);
+extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol);
 
 #endif /* NTFS_RW */
 
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index e1fceb8aa32d..d11753c50bc1 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -152,14 +152,16 @@ static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster,
 	struct o2nm_node *node, *ret = NULL;
 
 	while (*p) {
+		int cmp;
+
 		parent = *p;
 		node = rb_entry(parent, struct o2nm_node, nd_ip_node);
 
-		if (memcmp(&ip_needle, &node->nd_ipv4_address,
-		           sizeof(ip_needle)) < 0)
+		cmp = memcmp(&ip_needle, &node->nd_ipv4_address,
+				sizeof(ip_needle));
+		if (cmp < 0)
 			p = &(*p)->rb_left;
-		else if (memcmp(&ip_needle, &node->nd_ipv4_address,
-			        sizeof(ip_needle)) > 0)
+		else if (cmp > 0)
 			p = &(*p)->rb_right;
 		else {
 			ret = node;
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 033ad1701232..16b8d1ba7066 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -335,11 +335,10 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_nlink++;
+		inc_nlink(inode);
 
 		inode->i_fop = &simple_dir_operations;
 		inode->i_op = &dlmfs_root_inode_operations;
@@ -362,7 +361,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -397,7 +395,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 
 		/* directory inodes start off with i_nlink ==
 		 * 2 (for "." entry) */
-		inode->i_nlink++;
+		inc_nlink(inode);
 		break;
 	}
 
@@ -451,7 +449,7 @@ static int dlmfs_mkdir(struct inode * dir,
 	}
 	ip->ip_dlm = dlm;
 
-	dir->i_nlink++;
+	inc_nlink(dir);
 	d_instantiate(dentry, inode);
 	dget(dentry);	/* Extra count - pin the dentry in core */
 
@@ -629,9 +627,7 @@ static void __exit exit_dlmfs_fs(void)
 	flush_workqueue(user_dlm_worker);
 	destroy_workqueue(user_dlm_worker);
 
-	if (kmem_cache_destroy(dlmfs_inode_cache))
-		printk(KERN_INFO "dlmfs_inode_cache: not all structures "
-		       "were freed\n");
+	kmem_cache_destroy(dlmfs_inode_cache);
 }
 
 MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index de887063dcfc..8801e41afe80 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2052,7 +2052,7 @@ static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
 		mlog_errno(ret);
 		goto out;
 	}
-	osb = (struct ocfs2_super *) inode->u.generic_ip;
+	osb = inode->i_private;
 	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
 	priv->p_dlm_debug = osb->osb_dlm_debug;
 	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2bbfa17090cf..1be74c4e7814 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -30,6 +30,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/uio.h>
+#include <linux/sched.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -691,6 +692,12 @@ static int ocfs2_zero_extend(struct inode *inode,
 		}
 
 		start_off += sb->s_blocksize;
+
+		/*
+		 * Very large extends have the potential to lock up
+		 * the cpu for extended periods of time.
+		 */
+		cond_resched();
 	}
 
 out:
@@ -728,31 +735,36 @@ static int ocfs2_extend_file(struct inode *inode,
 	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 
 		OCFS2_I(inode)->ip_clusters;
 
-	if (clusters_to_add) {
-		/* 
-		 * protect the pages that ocfs2_zero_extend is going to
-		 * be pulling into the page cache.. we do this before the
-		 * metadata extend so that we don't get into the situation
-		 * where we've extended the metadata but can't get the data
-		 * lock to zero.
-		 */
-		ret = ocfs2_data_lock(inode, 1);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
+	/* 
+	 * protect the pages that ocfs2_zero_extend is going to be
+	 * pulling into the page cache.. we do this before the
+	 * metadata extend so that we don't get into the situation
+	 * where we've extended the metadata but can't get the data
+	 * lock to zero.
+	 */
+	ret = ocfs2_data_lock(inode, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
 
+	if (clusters_to_add) {
 		ret = ocfs2_extend_allocation(inode, clusters_to_add);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_unlock;
 		}
+	}
 
-		ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out_unlock;
-		}
+	/*
+	 * Call this even if we don't add any clusters to the tree. We
+	 * still need to zero the area between the old i_size and the
+	 * new i_size.
+	 */
+	ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
 	}
 
 	if (!tail_to_skip) {
@@ -764,8 +776,7 @@ static int ocfs2_extend_file(struct inode *inode,
 	}
 
 out_unlock:
-	if (clusters_to_add) /* this is the only case in which we lock */
-		ocfs2_data_unlock(inode, 1);
+	ocfs2_data_unlock(inode, 1);
 
 out:
 	return ret;
@@ -961,25 +972,23 @@ static inline int ocfs2_write_should_remove_suid(struct inode *inode)
 }
 
 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
-				    const char __user *buf,
-				    size_t count,
+				    const struct iovec *iov,
+				    unsigned long nr_segs,
 				    loff_t pos)
 {
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-				   .iov_len = count };
 	int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
 	u32 clusters;
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 	loff_t newsize, saved_pos;
 
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
-		   (unsigned int)count,
+	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+		   (unsigned int)nr_segs,
 		   filp->f_dentry->d_name.len,
 		   filp->f_dentry->d_name.name);
 
 	/* happy write of zero bytes */
-	if (count == 0)
+	if (iocb->ki_left == 0)
 		return 0;
 
 	if (!inode) {
@@ -1048,7 +1057,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		} else {
 			saved_pos = iocb->ki_pos;
 		}
-		newsize = count + saved_pos;
+		newsize = iocb->ki_left + saved_pos;
 
 		mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
 		     (long long) saved_pos, (long long) newsize,
@@ -1081,7 +1090,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		if (!clusters)
 			break;
 
-		ret = ocfs2_extend_file(inode, NULL, newsize, count);
+		ret = ocfs2_extend_file(inode, NULL, newsize, iocb->ki_left);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
@@ -1098,7 +1107,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb);
 
-	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+	ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
 
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -1132,16 +1141,16 @@ out:
 }
 
 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
-				   char __user *buf,
-				   size_t count,
+				   const struct iovec *iov,
+				   unsigned long nr_segs,
 				   loff_t pos)
 {
 	int ret = 0, rw_level = -1, have_alloc_sem = 0;
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
-		   (unsigned int)count,
+	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+		   (unsigned int)nr_segs,
 		   filp->f_dentry->d_name.len,
 		   filp->f_dentry->d_name.name);
 
@@ -1185,7 +1194,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	}
 	ocfs2_meta_unlock(inode, 0);
 
-	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
+	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
 	if (ret == -EINVAL)
 		mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
 
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 69d3db569166..16e8e74dc966 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -269,7 +269,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	inode->i_mode = le16_to_cpu(fe->i_mode);
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);
-	inode->i_blksize = (u32)osb->s_clustersize;
 
 	/* Fast symlinks will have i_size but no allocated clusters. */
 	if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
@@ -1258,8 +1257,6 @@ leave:
 void ocfs2_refresh_inode(struct inode *inode,
 			 struct ocfs2_dinode *fe)
 {
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
@@ -1270,7 +1267,6 @@ void ocfs2_refresh_inode(struct inode *inode,
 	inode->i_uid = le32_to_cpu(fe->i_uid);
 	inode->i_gid = le32_to_cpu(fe->i_gid);
 	inode->i_mode = le16_to_cpu(fe->i_mode);
-	inode->i_blksize = (u32) osb->s_clustersize;
 	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
 		inode->i_blocks = 0;
 	else
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 849c3b4bb94a..a57b751d4f40 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -429,7 +429,7 @@ static int ocfs2_mknod(struct inode *dir,
 			mlog_errno(status);
 			goto leave;
 		}
-		dir->i_nlink++;
+		inc_nlink(dir);
 	}
 
 	status = ocfs2_add_entry(handle, dentry, inode,
@@ -730,7 +730,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto bail;
 	}
 
-	inode->i_nlink++;
+	inc_nlink(inode);
 	inode->i_ctime = CURRENT_TIME;
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 	fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
@@ -739,7 +739,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 	err = ocfs2_journal_dirty(handle, fe_bh);
 	if (err < 0) {
 		le16_add_cpu(&fe->i_links_count, -1);
-		inode->i_nlink--;
+		drop_nlink(inode);
 		mlog_errno(err);
 		goto bail;
 	}
@@ -749,7 +749,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 			      parent_fe_bh, de_bh);
 	if (err) {
 		le16_add_cpu(&fe->i_links_count, -1);
-		inode->i_nlink--;
+		drop_nlink(inode);
 		mlog_errno(err);
 		goto bail;
 	}
@@ -795,11 +795,23 @@ static int ocfs2_remote_dentry_delete(struct dentry *dentry)
 	return ret;
 }
 
+static inline int inode_is_unlinkable(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode)) {
+		if (inode->i_nlink == 2)
+			return 1;
+		return 0;
+	}
+
+	if (inode->i_nlink == 1)
+		return 1;
+	return 0;
+}
+
 static int ocfs2_unlink(struct inode *dir,
 			struct dentry *dentry)
 {
 	int status;
-	unsigned int saved_nlink = 0;
 	struct inode *inode = dentry->d_inode;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	u64 blkno;
@@ -874,16 +886,6 @@ static int ocfs2_unlink(struct inode *dir,
 		}
 	}
 
-	/* There are still a few steps left until we can consider the
-	 * unlink to have succeeded. Save off nlink here before
-	 * modification so we can set it back in case we hit an issue
-	 * before commit. */
-	saved_nlink = inode->i_nlink;
-	if (S_ISDIR(inode->i_mode))
-		inode->i_nlink = 0;
-	else
-		inode->i_nlink--;
-
 	status = ocfs2_remote_dentry_delete(dentry);
 	if (status < 0) {
 		/* This vote should succeed under all normal
@@ -892,7 +894,7 @@ static int ocfs2_unlink(struct inode *dir,
 		goto leave;
 	}
 
-	if (!inode->i_nlink) {
+	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, handle, inode,
 						  orphan_name,
 						  &orphan_entry_bh);
@@ -919,7 +921,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
-	if (!inode->i_nlink) {
+	if (inode_is_unlinkable(inode)) {
 		status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name,
 					  orphan_entry_bh);
 		if (status < 0) {
@@ -935,10 +937,10 @@ static int ocfs2_unlink(struct inode *dir,
 		goto leave;
 	}
 
-	/* We can set nlink on the dinode now. clear the saved version
-	 * so that it doesn't get set later. */
+	if (S_ISDIR(inode->i_mode))
+		drop_nlink(inode);
+	drop_nlink(inode);
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
-	saved_nlink = 0;
 
 	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0) {
@@ -947,19 +949,16 @@ static int ocfs2_unlink(struct inode *dir,
 	}
 
 	if (S_ISDIR(inode->i_mode)) {
-		dir->i_nlink--;
+		drop_nlink(dir);
 		status = ocfs2_mark_inode_dirty(handle, dir,
 						parent_node_bh);
 		if (status < 0) {
 			mlog_errno(status);
-			dir->i_nlink++;
+			inc_nlink(dir);
 		}
 	}
 
 leave:
-	if (status < 0 && saved_nlink)
-		inode->i_nlink = saved_nlink;
-
 	if (handle)
 		ocfs2_commit_trans(handle);
 
@@ -1086,14 +1085,6 @@ static int ocfs2_rename(struct inode *old_dir,
 			BUG();
 	}
 
-	if (atomic_read(&old_dentry->d_count) > 2) {
-		shrink_dcache_parent(old_dentry);
-		if (atomic_read(&old_dentry->d_count) > 2) {
-			status = -EBUSY;
-			goto bail;
-		}
-	}
-
 	/* Assume a directory heirarchy thusly:
 	 * a/b/c
 	 * a/d
@@ -1382,7 +1373,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		if (new_inode) {
 			new_inode->i_nlink--;
 		} else {
-			new_dir->i_nlink++;
+			inc_nlink(new_dir);
 			mark_inode_dirty(new_dir);
 		}
 	}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4c29cd7cc8e6..76b46ebbb10c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -339,7 +339,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
 
 #if BITS_PER_LONG == 32
 # if defined(CONFIG_LBD)
-	BUG_ON(sizeof(sector_t) != 8);
+	BUILD_BUG_ON(sizeof(sector_t) != 8);
 	pagefactor = PAGE_CACHE_SIZE;
 	bitshift = BITS_PER_LONG;
 # else
diff --git a/fs/open.c b/fs/open.c
index 303f06d2a7b9..89e0c237a636 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -6,7 +6,6 @@
 
 #include <linux/string.h>
 #include <linux/mm.h>
-#include <linux/utime.h>
 #include <linux/file.h>
 #include <linux/smp_lock.h>
 #include <linux/quotaops.h>
@@ -29,8 +28,6 @@
 #include <linux/rcupdate.h>
 #include <linux/audit.h>
 
-#include <asm/unistd.h>
-
 int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval = -ENODEV;
@@ -353,137 +350,6 @@ asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
 }
 #endif
 
-#ifdef __ARCH_WANT_SYS_UTIME
-
-/*
- * sys_utime() can be implemented in user-level using sys_utimes().
- * Is this for backwards compatibility?  If so, why not move it
- * into the appropriate arch directory (for those architectures that
- * need it).
- */
-
-/* If times==NULL, set access and modification to current time,
- * must be owner or have write permission.
- * Else, update from *times, must be owner or super user.
- */
-asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times)
-{
-	int error;
-	struct nameidata nd;
-	struct inode * inode;
-	struct iattr newattrs;
-
-	error = user_path_walk(filename, &nd);
-	if (error)
-		goto out;
-	inode = nd.dentry->d_inode;
-
-	error = -EROFS;
-	if (IS_RDONLY(inode))
-		goto dput_and_out;
-
-	/* Don't worry, the checks are done in inode_change_ok() */
-	newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
-	if (times) {
-		error = -EPERM;
-		if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-			goto dput_and_out;
-
-		error = get_user(newattrs.ia_atime.tv_sec, &times->actime);
-		newattrs.ia_atime.tv_nsec = 0;
-		if (!error)
-			error = get_user(newattrs.ia_mtime.tv_sec, &times->modtime);
-		newattrs.ia_mtime.tv_nsec = 0;
-		if (error)
-			goto dput_and_out;
-
-		newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
-	} else {
-                error = -EACCES;
-                if (IS_IMMUTABLE(inode))
-                        goto dput_and_out;
-
-		if (current->fsuid != inode->i_uid &&
-		    (error = vfs_permission(&nd, MAY_WRITE)) != 0)
-			goto dput_and_out;
-	}
-	mutex_lock(&inode->i_mutex);
-	error = notify_change(nd.dentry, &newattrs);
-	mutex_unlock(&inode->i_mutex);
-dput_and_out:
-	path_release(&nd);
-out:
-	return error;
-}
-
-#endif
-
-/* If times==NULL, set access and modification to current time,
- * must be owner or have write permission.
- * Else, update from *times, must be owner or super user.
- */
-long do_utimes(int dfd, char __user *filename, struct timeval *times)
-{
-	int error;
-	struct nameidata nd;
-	struct inode * inode;
-	struct iattr newattrs;
-
-	error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
-
-	if (error)
-		goto out;
-	inode = nd.dentry->d_inode;
-
-	error = -EROFS;
-	if (IS_RDONLY(inode))
-		goto dput_and_out;
-
-	/* Don't worry, the checks are done in inode_change_ok() */
-	newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
-	if (times) {
-		error = -EPERM;
-                if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-                        goto dput_and_out;
-
-		newattrs.ia_atime.tv_sec = times[0].tv_sec;
-		newattrs.ia_atime.tv_nsec = times[0].tv_usec * 1000;
-		newattrs.ia_mtime.tv_sec = times[1].tv_sec;
-		newattrs.ia_mtime.tv_nsec = times[1].tv_usec * 1000;
-		newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
-	} else {
-		error = -EACCES;
-                if (IS_IMMUTABLE(inode))
-                        goto dput_and_out;
-
-		if (current->fsuid != inode->i_uid &&
-		    (error = vfs_permission(&nd, MAY_WRITE)) != 0)
-			goto dput_and_out;
-	}
-	mutex_lock(&inode->i_mutex);
-	error = notify_change(nd.dentry, &newattrs);
-	mutex_unlock(&inode->i_mutex);
-dput_and_out:
-	path_release(&nd);
-out:
-	return error;
-}
-
-asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes)
-{
-	struct timeval times[2];
-
-	if (utimes && copy_from_user(&times, utimes, sizeof(times)))
-		return -EFAULT;
-	return do_utimes(dfd, filename, utimes ? times : NULL);
-}
-
-asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes)
-{
-	return sys_futimesat(AT_FDCWD, filename, utimes);
-}
-
-
 /*
  * access() needs to use the real uid/gid, not the effective uid/gid.
  * We do this by temporarily clearing all FS-related capabilities and
@@ -520,15 +386,21 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
 		current->cap_effective = current->cap_permitted;
 
 	res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
-	if (!res) {
-		res = vfs_permission(&nd, mode);
-		/* SuS v2 requires we report a read only fs too */
-		if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
-		   && !special_file(nd.dentry->d_inode->i_mode))
-			res = -EROFS;
-		path_release(&nd);
-	}
+	if (res)
+		goto out;
+
+	res = vfs_permission(&nd, mode);
+	/* SuS v2 requires we report a read only fs too */
+	if(res || !(mode & S_IWOTH) ||
+	   special_file(nd.dentry->d_inode->i_mode))
+		goto out_path_release;
+
+	if(IS_RDONLY(nd.dentry->d_inode))
+		res = -EROFS;
 
+out_path_release:
+	path_release(&nd);
+out:
 	current->fsuid = old_fsuid;
 	current->fsgid = old_fsgid;
 	current->cap_effective = old_cap;
@@ -546,7 +418,8 @@ asmlinkage long sys_chdir(const char __user * filename)
 	struct nameidata nd;
 	int error;
 
-	error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd);
+	error = __user_walk(filename,
+			    LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd);
 	if (error)
 		goto out;
 
@@ -736,10 +609,11 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
 	int error;
 
 	error = user_path_walk(filename, &nd);
-	if (!error) {
-		error = chown_common(nd.dentry, user, group);
-		path_release(&nd);
-	}
+	if (error)
+		goto out;
+	error = chown_common(nd.dentry, user, group);
+	path_release(&nd);
+out:
 	return error;
 }
 
@@ -755,10 +629,10 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
 
 	follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
 	error = __user_walk_fd(dfd, filename, follow, &nd);
-	if (!error) {
-		error = chown_common(nd.dentry, user, group);
-		path_release(&nd);
-	}
+	if (error)
+		goto out;
+	error = chown_common(nd.dentry, user, group);
+	path_release(&nd);
 out:
 	return error;
 }
@@ -769,10 +643,11 @@ asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group
 	int error;
 
 	error = user_path_walk_link(filename, &nd);
-	if (!error) {
-		error = chown_common(nd.dentry, user, group);
-		path_release(&nd);
-	}
+	if (error)
+		goto out;
+	error = chown_common(nd.dentry, user, group);
+	path_release(&nd);
+out:
 	return error;
 }
 
@@ -781,15 +656,17 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
 {
 	struct file * file;
 	int error = -EBADF;
+	struct dentry * dentry;
 
 	file = fget(fd);
-	if (file) {
-		struct dentry * dentry;
-		dentry = file->f_dentry;
-		audit_inode(NULL, dentry->d_inode);
-		error = chown_common(dentry, user, group);
-		fput(file);
-	}
+	if (!file)
+		goto out;
+
+	dentry = file->f_dentry;
+	audit_inode(NULL, dentry->d_inode);
+	error = chown_common(dentry, user, group);
+	fput(file);
+out:
 	return error;
 }
 
@@ -1172,6 +1049,7 @@ asmlinkage long sys_close(unsigned int fd)
 	struct file * filp;
 	struct files_struct *files = current->files;
 	struct fdtable *fdt;
+	int retval;
 
 	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
@@ -1184,7 +1062,16 @@ asmlinkage long sys_close(unsigned int fd)
 	FD_CLR(fd, fdt->close_on_exec);
 	__put_unused_fd(files, fd);
 	spin_unlock(&files->file_lock);
-	return filp_close(filp, files);
+	retval = filp_close(filp, files);
+
+	/* can't restart close syscall because file table entry was cleared */
+	if (unlikely(retval == -ERESTARTSYS ||
+		     retval == -ERESTARTNOINTR ||
+		     retval == -ERESTARTNOHAND ||
+		     retval == -ERESTART_RESTARTBLOCK))
+		retval = -EINTR;
+
+	return retval;
 
 out_unlock:
 	spin_unlock(&files->file_lock);
diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile
index d713ce6b3e12..67e665fdb7fc 100644
--- a/fs/partitions/Makefile
+++ b/fs/partitions/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-obj-y := check.o
+obj-$(CONFIG_BLOCK) := check.o
 
 obj-$(CONFIG_ACORN_PARTITION) += acorn.o
 obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 51c6a748df49..6fb4b6150d77 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -376,18 +376,48 @@ static char *make_block_name(struct gendisk *disk)
 	return name;
 }
 
-static void disk_sysfs_symlinks(struct gendisk *disk)
+static int disk_sysfs_symlinks(struct gendisk *disk)
 {
 	struct device *target = get_device(disk->driverfs_dev);
+	int err;
+	char *disk_name = NULL;
+
 	if (target) {
-		char *disk_name = make_block_name(disk);
-		sysfs_create_link(&disk->kobj,&target->kobj,"device");
-		if (disk_name) {
-			sysfs_create_link(&target->kobj,&disk->kobj,disk_name);
-			kfree(disk_name);
+		disk_name = make_block_name(disk);
+		if (!disk_name) {
+			err = -ENOMEM;
+			goto err_out;
 		}
+
+		err = sysfs_create_link(&disk->kobj, &target->kobj, "device");
+		if (err)
+			goto err_out_disk_name;
+
+		err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);
+		if (err)
+			goto err_out_dev_link;
 	}
-	sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj, "subsystem");
+
+	err = sysfs_create_link(&disk->kobj, &block_subsys.kset.kobj,
+				"subsystem");
+	if (err)
+		goto err_out_disk_name_lnk;
+
+	kfree(disk_name);
+
+	return 0;
+
+err_out_disk_name_lnk:
+	if (target) {
+		sysfs_remove_link(&target->kobj, disk_name);
+err_out_dev_link:
+		sysfs_remove_link(&disk->kobj, "device");
+err_out_disk_name:
+		kfree(disk_name);
+err_out:
+		put_device(target);
+	}
+	return err;
 }
 
 /* Not exported, helper to add_disk(). */
@@ -406,7 +436,11 @@ void register_disk(struct gendisk *disk)
 		*s = '!';
 	if ((err = kobject_add(&disk->kobj)))
 		return;
-	disk_sysfs_symlinks(disk);
+	err = disk_sysfs_symlinks(disk);
+	if (err) {
+		kobject_del(&disk->kobj);
+		return;
+	}
  	disk_sysfs_add_subdirs(disk);
 
 	/* No minors to use for partitions */
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 63730282ad81..1bea610078b3 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -238,10 +238,9 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
                 le32_to_cpu(gpt->sizeof_partition_entry);
 	if (!count)
 		return NULL;
-	pte = kmalloc(count, GFP_KERNEL);
+	pte = kzalloc(count, GFP_KERNEL);
 	if (!pte)
 		return NULL;
-	memset(pte, 0, count);
 
 	if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
                      (u8 *) pte,
@@ -269,10 +268,9 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
 	if (!bdev)
 		return NULL;
 
-	gpt = kmalloc(sizeof (gpt_header), GFP_KERNEL);
+	gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL);
 	if (!gpt)
 		return NULL;
-	memset(gpt, 0, sizeof (gpt_header));
 
 	if (read_lba(bdev, lba, (u8 *) gpt,
 		     sizeof (gpt_header)) < sizeof (gpt_header)) {
@@ -526,9 +524,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
 	lastlba = last_lba(bdev);
         if (!force_gpt) {
                 /* This will be added to the EFI Spec. per Intel after v1.02. */
-                legacymbr = kmalloc(sizeof (*legacymbr), GFP_KERNEL);
+                legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
                 if (legacymbr) {
-                        memset(legacymbr, 0, sizeof (*legacymbr));
                         read_lba(bdev, 0, (u8 *) legacymbr,
                                  sizeof (*legacymbr));
                         good_pmbr = is_pmbr_valid(legacymbr, lastlba);
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 7ab1c11dca4e..1a60926a4ccd 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -30,11 +30,6 @@
 #include "check.h"
 #include "msdos.h"
 
-typedef enum {
-	FALSE = 0,
-	TRUE  = 1
-} BOOL;
-
 /**
  * ldm_debug/info/error/crit - Output an error message
  * @f:    A printf format string containing the message
@@ -103,24 +98,24 @@ static int ldm_parse_hexbyte (const u8 *src)
  *
  * N.B. The GUID need not be NULL terminated.
  *
- * Return:  TRUE   @dest contains binary GUID
- *          FALSE  @dest contents are undefined
+ * Return:  'true'   @dest contains binary GUID
+ *          'false'  @dest contents are undefined
  */
-static BOOL ldm_parse_guid (const u8 *src, u8 *dest)
+static bool ldm_parse_guid (const u8 *src, u8 *dest)
 {
 	static const int size[] = { 4, 2, 2, 2, 6 };
 	int i, j, v;
 
 	if (src[8]  != '-' || src[13] != '-' ||
 	    src[18] != '-' || src[23] != '-')
-		return FALSE;
+		return false;
 
 	for (j = 0; j < 5; j++, src++)
 		for (i = 0; i < size[j]; i++, src+=2, *dest++ = v)
 			if ((v = ldm_parse_hexbyte (src)) < 0)
-				return FALSE;
+				return false;
 
-	return TRUE;
+	return true;
 }
 
 
@@ -132,17 +127,17 @@ static BOOL ldm_parse_guid (const u8 *src, u8 *dest)
  * This parses the LDM database PRIVHEAD structure supplied in @data and
  * sets up the in-memory privhead structure @ph with the obtained information.
  *
- * Return:  TRUE   @ph contains the PRIVHEAD data
- *          FALSE  @ph contents are undefined
+ * Return:  'true'   @ph contains the PRIVHEAD data
+ *          'false'  @ph contents are undefined
  */
-static BOOL ldm_parse_privhead (const u8 *data, struct privhead *ph)
+static bool ldm_parse_privhead (const u8 *data, struct privhead *ph)
 {
 	BUG_ON (!data || !ph);
 
 	if (MAGIC_PRIVHEAD != BE64 (data)) {
 		ldm_error ("Cannot find PRIVHEAD structure. LDM database is"
 			" corrupt. Aborting.");
-		return FALSE;
+		return false;
 	}
 
 	ph->ver_major          = BE16 (data + 0x000C);
@@ -155,7 +150,7 @@ static BOOL ldm_parse_privhead (const u8 *data, struct privhead *ph)
 	if ((ph->ver_major != 2) || (ph->ver_minor != 11)) {
 		ldm_error ("Expected PRIVHEAD version %d.%d, got %d.%d."
 			" Aborting.", 2, 11, ph->ver_major, ph->ver_minor);
-		return FALSE;
+		return false;
 	}
 	if (ph->config_size != LDM_DB_SIZE) {	/* 1 MiB in sectors. */
 		/* Warn the user and continue, carefully */
@@ -166,16 +161,16 @@ static BOOL ldm_parse_privhead (const u8 *data, struct privhead *ph)
 	if ((ph->logical_disk_size == 0) ||
 	    (ph->logical_disk_start + ph->logical_disk_size > ph->config_start)) {
 		ldm_error ("PRIVHEAD disk size doesn't match real disk size");
-		return FALSE;
+		return false;
 	}
 
 	if (!ldm_parse_guid (data + 0x0030, ph->disk_id)) {
 		ldm_error ("PRIVHEAD contains an invalid GUID.");
-		return FALSE;
+		return false;
 	}
 
 	ldm_debug ("Parsed PRIVHEAD successfully.");
-	return TRUE;
+	return true;
 }
 
 /**
@@ -189,16 +184,16 @@ static BOOL ldm_parse_privhead (const u8 *data, struct privhead *ph)
  *
  * N.B.  The *_start and *_size values returned in @toc are not range-checked.
  *
- * Return:  TRUE   @toc contains the TOCBLOCK data
- *          FALSE  @toc contents are undefined
+ * Return:  'true'   @toc contains the TOCBLOCK data
+ *          'false'  @toc contents are undefined
  */
-static BOOL ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
+static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
 {
 	BUG_ON (!data || !toc);
 
 	if (MAGIC_TOCBLOCK != BE64 (data)) {
 		ldm_crit ("Cannot find TOCBLOCK, database may be corrupt.");
-		return FALSE;
+		return false;
 	}
 	strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name));
 	toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0;
@@ -209,7 +204,7 @@ static BOOL ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
 			sizeof (toc->bitmap1_name)) != 0) {
 		ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.",
 				TOC_BITMAP1, toc->bitmap1_name);
-		return FALSE;
+		return false;
 	}
 	strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name));
 	toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0;
@@ -219,10 +214,10 @@ static BOOL ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
 			sizeof (toc->bitmap2_name)) != 0) {
 		ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.",
 				TOC_BITMAP2, toc->bitmap2_name);
-		return FALSE;
+		return false;
 	}
 	ldm_debug ("Parsed TOCBLOCK successfully.");
-	return TRUE;
+	return true;
 }
 
 /**
@@ -235,16 +230,16 @@ static BOOL ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
  *
  * N.B.  The *_start, *_size and *_seq values will be range-checked later.
  *
- * Return:  TRUE   @vm contains VMDB info
- *          FALSE  @vm contents are undefined
+ * Return:  'true'   @vm contains VMDB info
+ *          'false'  @vm contents are undefined
  */
-static BOOL ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
+static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
 {
 	BUG_ON (!data || !vm);
 
 	if (MAGIC_VMDB != BE32 (data)) {
 		ldm_crit ("Cannot find the VMDB, database may be corrupt.");
-		return FALSE;
+		return false;
 	}
 
 	vm->ver_major = BE16 (data + 0x12);
@@ -252,7 +247,7 @@ static BOOL ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
 	if ((vm->ver_major != 4) || (vm->ver_minor != 10)) {
 		ldm_error ("Expected VMDB version %d.%d, got %d.%d. "
 			"Aborting.", 4, 10, vm->ver_major, vm->ver_minor);
-		return FALSE;
+		return false;
 	}
 
 	vm->vblk_size     = BE32 (data + 0x08);
@@ -260,7 +255,7 @@ static BOOL ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
 	vm->last_vblk_seq = BE32 (data + 0x04);
 
 	ldm_debug ("Parsed VMDB successfully.");
-	return TRUE;
+	return true;
 }
 
 /**
@@ -270,10 +265,10 @@ static BOOL ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
  *
  * This compares the two privhead structures @ph1 and @ph2.
  *
- * Return:  TRUE   Identical
- *          FALSE  Different
+ * Return:  'true'   Identical
+ *          'false'  Different
  */
-static BOOL ldm_compare_privheads (const struct privhead *ph1,
+static bool ldm_compare_privheads (const struct privhead *ph1,
 				   const struct privhead *ph2)
 {
 	BUG_ON (!ph1 || !ph2);
@@ -294,10 +289,10 @@ static BOOL ldm_compare_privheads (const struct privhead *ph1,
  *
  * This compares the two tocblock structures @toc1 and @toc2.
  *
- * Return:  TRUE   Identical
- *          FALSE  Different
+ * Return:  'true'   Identical
+ *          'false'  Different
  */
-static BOOL ldm_compare_tocblocks (const struct tocblock *toc1,
+static bool ldm_compare_tocblocks (const struct tocblock *toc1,
 				   const struct tocblock *toc2)
 {
 	BUG_ON (!toc1 || !toc2);
@@ -323,17 +318,17 @@ static BOOL ldm_compare_tocblocks (const struct tocblock *toc1,
  * the configuration area (the database).  The values are range-checked against
  * @hd, which contains the real size of the disk.
  *
- * Return:  TRUE   Success
- *          FALSE  Error
+ * Return:  'true'   Success
+ *          'false'  Error
  */
-static BOOL ldm_validate_privheads (struct block_device *bdev,
+static bool ldm_validate_privheads (struct block_device *bdev,
 				    struct privhead *ph1)
 {
 	static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
 	struct privhead *ph[3] = { ph1 };
 	Sector sect;
 	u8 *data;
-	BOOL result = FALSE;
+	bool result = false;
 	long num_sects;
 	int i;
 
@@ -393,7 +388,7 @@ static BOOL ldm_validate_privheads (struct block_device *bdev,
 		goto out;
 	}*/
 	ldm_debug ("Validated PRIVHEADs successfully.");
-	result = TRUE;
+	result = true;
 out:
 	kfree (ph[1]);
 	kfree (ph[2]);
@@ -411,10 +406,10 @@ out:
  *
  * The offsets and sizes of the configs are range-checked against a privhead.
  *
- * Return:  TRUE   @toc1 contains validated TOCBLOCK info
- *          FALSE  @toc1 contents are undefined
+ * Return:  'true'   @toc1 contains validated TOCBLOCK info
+ *          'false'  @toc1 contents are undefined
  */
-static BOOL ldm_validate_tocblocks (struct block_device *bdev,
+static bool ldm_validate_tocblocks (struct block_device *bdev,
 	unsigned long base, struct ldmdb *ldb)
 {
 	static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
@@ -422,7 +417,7 @@ static BOOL ldm_validate_tocblocks (struct block_device *bdev,
 	struct privhead *ph;
 	Sector sect;
 	u8 *data;
-	BOOL result = FALSE;
+	bool result = false;
 	int i;
 
 	BUG_ON (!bdev || !ldb);
@@ -465,7 +460,7 @@ static BOOL ldm_validate_tocblocks (struct block_device *bdev,
 	}
 
 	ldm_debug ("Validated TOCBLOCKs successfully.");
-	result = TRUE;
+	result = true;
 out:
 	kfree (tb[1]);
 	kfree (tb[2]);
@@ -482,15 +477,15 @@ out:
  * Find the vmdb of the LDM Database stored on @bdev and return the parsed
  * information in @ldb.
  *
- * Return:  TRUE   @ldb contains validated VBDB info
- *          FALSE  @ldb contents are undefined
+ * Return:  'true'   @ldb contains validated VBDB info
+ *          'false'  @ldb contents are undefined
  */
-static BOOL ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
+static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
 			       struct ldmdb *ldb)
 {
 	Sector sect;
 	u8 *data;
-	BOOL result = FALSE;
+	bool result = false;
 	struct vmdb *vm;
 	struct tocblock *toc;
 
@@ -502,7 +497,7 @@ static BOOL ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
 	data = read_dev_sector (bdev, base + OFF_VMDB, &sect);
 	if (!data) {
 		ldm_crit ("Disk read failed.");
-		return FALSE;
+		return false;
 	}
 
 	if (!ldm_parse_vmdb (data, vm))
@@ -527,7 +522,7 @@ static BOOL ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
 		goto out;
 	}
 
-	result = TRUE;
+	result = true;
 out:
 	put_dev_sector (sect);
 	return result;
@@ -547,23 +542,23 @@ out:
  *       only likely to happen if the underlying device is strange.  If that IS
  *       the case we should return zero to let someone else try.
  *
- * Return:  TRUE   @bdev is a dynamic disk
- *          FALSE  @bdev is not a dynamic disk, or an error occurred
+ * Return:  'true'   @bdev is a dynamic disk
+ *          'false'  @bdev is not a dynamic disk, or an error occurred
  */
-static BOOL ldm_validate_partition_table (struct block_device *bdev)
+static bool ldm_validate_partition_table (struct block_device *bdev)
 {
 	Sector sect;
 	u8 *data;
 	struct partition *p;
 	int i;
-	BOOL result = FALSE;
+	bool result = false;
 
 	BUG_ON (!bdev);
 
 	data = read_dev_sector (bdev, 0, &sect);
 	if (!data) {
 		ldm_crit ("Disk read failed.");
-		return FALSE;
+		return false;
 	}
 
 	if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC))
@@ -572,7 +567,7 @@ static BOOL ldm_validate_partition_table (struct block_device *bdev)
 	p = (struct partition*)(data + 0x01BE);
 	for (i = 0; i < 4; i++, p++)
 		if (SYS_IND (p) == WIN2K_DYNAMIC_PARTITION) {
-			result = TRUE;
+			result = true;
 			break;
 		}
 
@@ -625,10 +620,10 @@ static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb)
  * N.B.  This function creates the partitions in the order it finds partition
  *       objects in the linked list.
  *
- * Return:  TRUE   Partition created
- *          FALSE  Error, probably a range checking problem
+ * Return:  'true'   Partition created
+ *          'false'  Error, probably a range checking problem
  */
-static BOOL ldm_create_data_partitions (struct parsed_partitions *pp,
+static bool ldm_create_data_partitions (struct parsed_partitions *pp,
 					const struct ldmdb *ldb)
 {
 	struct list_head *item;
@@ -642,7 +637,7 @@ static BOOL ldm_create_data_partitions (struct parsed_partitions *pp,
 	disk = ldm_get_disk_objid (ldb);
 	if (!disk) {
 		ldm_crit ("Can't find the ID of this disk in the database.");
-		return FALSE;
+		return false;
 	}
 
 	printk (" [LDM]");
@@ -661,7 +656,7 @@ static BOOL ldm_create_data_partitions (struct parsed_partitions *pp,
 	}
 
 	printk ("\n");
-	return TRUE;
+	return true;
 }
 
 
@@ -766,10 +761,10 @@ static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen)
  *
  * Read a raw VBLK Component object (version 3) into a vblk structure.
  *
- * Return:  TRUE   @vb contains a Component VBLK
- *          FALSE  @vb contents are not defined
+ * Return:  'true'   @vb contains a Component VBLK
+ *          'false'  @vb contents are not defined
  */
-static BOOL ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
+static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
 {
 	int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len;
 	struct vblk_comp *comp;
@@ -792,11 +787,11 @@ static BOOL ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
 		len = r_parent;
 	}
 	if (len < 0)
-		return FALSE;
+		return false;
 
 	len += VBLK_SIZE_CMP3;
 	if (len != BE32 (buffer + 0x14))
-		return FALSE;
+		return false;
 
 	comp = &vb->vblk.comp;
 	ldm_get_vstr (buffer + 0x18 + r_name, comp->state,
@@ -806,7 +801,7 @@ static BOOL ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
 	comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child);
 	comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0;
 
-	return TRUE;
+	return true;
 }
 
 /**
@@ -817,8 +812,8 @@ static BOOL ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
  *
  * Read a raw VBLK Disk Group object (version 3) into a vblk structure.
  *
- * Return:  TRUE   @vb contains a Disk Group VBLK
- *          FALSE  @vb contents are not defined
+ * Return:  'true'   @vb contains a Disk Group VBLK
+ *          'false'  @vb contents are not defined
  */
 static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
 {
@@ -841,16 +836,16 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
 		len = r_diskid;
 	}
 	if (len < 0)
-		return FALSE;
+		return false;
 
 	len += VBLK_SIZE_DGR3;
 	if (len != BE32 (buffer + 0x14))
-		return FALSE;
+		return false;
 
 	dgrp = &vb->vblk.dgrp;
 	ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id,
 		sizeof (dgrp->disk_id));
-	return TRUE;
+	return true;
 }
 
 /**
@@ -861,10 +856,10 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
  *
  * Read a raw VBLK Disk Group object (version 4) into a vblk structure.
  *
- * Return:  TRUE   @vb contains a Disk Group VBLK
- *          FALSE  @vb contents are not defined
+ * Return:  'true'   @vb contains a Disk Group VBLK
+ *          'false'  @vb contents are not defined
  */
-static BOOL ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
+static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
 {
 	char buf[64];
 	int r_objid, r_name, r_id1, r_id2, len;
@@ -885,16 +880,16 @@ static BOOL ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
 		len = r_name;
 	}
 	if (len < 0)
-		return FALSE;
+		return false;
 
 	len += VBLK_SIZE_DGR4;
 	if (len != BE32 (buffer + 0x14))
-		return FALSE;
+		return false;
 
 	dgrp = &vb->vblk.dgrp;
 
 	ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf));
-	return TRUE;
+	return true;
 }
 
 /**
@@ -905,10 +900,10 @@ static BOOL ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
  *
  * Read a raw VBLK Disk object (version 3) into a vblk structure.
  *
- * Return:  TRUE   @vb contains a Disk VBLK
- *          FALSE  @vb contents are not defined
+ * Return:  'true'   @vb contains a Disk VBLK
+ *          'false'  @vb contents are not defined
  */
-static BOOL ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
+static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
 {
 	int r_objid, r_name, r_diskid, r_altname, len;
 	struct vblk_disk *disk;
@@ -921,19 +916,19 @@ static BOOL ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
 	r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid);
 	len = r_altname;
 	if (len < 0)
-		return FALSE;
+		return false;
 
 	len += VBLK_SIZE_DSK3;
 	if (len != BE32 (buffer + 0x14))
-		return FALSE;
+		return false;
 
 	disk = &vb->vblk.disk;
 	ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name,
 		sizeof (disk->alt_name));
 	if (!ldm_parse_guid (buffer + 0x19 + r_name, disk->disk_id))
-		return FALSE;
+		return false;
 
-	return TRUE;
+	return true;
 }
 
 /**
@@ -944,10 +939,10 @@ static BOOL ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
  *
  * Read a raw VBLK Disk object (version 4) into a vblk structure.
  *
- * Return:  TRUE   @vb contains a Disk VBLK
- *          FALSE  @vb contents are not defined
+ * Return:  'true'   @vb contains a Disk VBLK
+ *          'false'  @vb contents are not defined
  */
-static BOOL ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
+static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
 {
 	int r_objid, r_name, len;
 	struct vblk_disk *disk;
@@ -958,15 +953,15 @@ static BOOL ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
 	r_name  = ldm_relative (buffer, buflen, 0x18, r_objid);
 	len     = r_name;
 	if (len < 0)
-		return FALSE;
+		return false;
 
 	len += VBLK_SIZE_DSK4;
 	if (len != BE32 (buffer + 0x14))
-		return FALSE;
+		return false;
 
 	disk = &vb->vblk.disk;
 	memcpy (disk->disk_id, buffer + 0x18 + r_name, GUID_SIZE);
-	return TRUE;
+	return true;
 }
 
 /**
@@ -977,10 +972,10 @@ static BOOL ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
  *
  * Read a raw VBLK Partition object (version 3) into a vblk structure.
  *
- * Return:  TRUE   @vb contains a Partition VBLK
- *          FALSE  @vb contents are not defined
+ * Return:  'true'   @vb contains a Partition VBLK
+ *          'false'  @vb contents are not defined
  */
-static BOOL ldm_parse_prt3 (const u8 *buffer, int buflen, struct vblk *vb)
+static bool ldm_parse_prt3 (const u8 *buffer, int buflen, struct vblk *vb)
 {
 	int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len;
 	struct vblk_part *part;
@@ -1001,11 +996,11 @@ static BOOL ldm_parse_prt3 (const u8 *buffer, int buflen, struct vblk *vb)
 		len = r_diskid;
 	}
 	if (len < 0)
-		return FALSE;
+		return false;
 
 	len += VBLK_SIZE_PRT3;
 	if (len != BE32 (buffer + 0x14))
-		return FALSE;
+		return false;
 
 	part = &vb->vblk.part;
 	part->start         = BE64         (buffer + 0x24 + r_name);
@@ -1018,7 +1013,7 @@ static BOOL ldm_parse_prt3 (const u8 *buffer, int buflen, struct vblk *vb)
 	else
 		part->partnum = 0;
 
-	return TRUE;
+	return true;
 }
 
 /**
@@ -1029,10 +1024,10 @@ static BOOL ldm_parse_prt3 (const u8 *buffer, int buflen, struct vblk *vb)
  *
  * Read a raw VBLK Volume object (version 5) into a vblk structure.
  *
- * Return:  TRUE   @vb contains a Volume VBLK
- *          FALSE  @vb contents are not defined
+ * Return:  'true'   @vb contains a Volume VBLK
+ *          'false'  @vb contents are not defined
  */
-static BOOL ldm_parse_vol5 (const u8 *buffer, int buflen, struct vblk *vb)
+static bool ldm_parse_vol5 (const u8 *buffer, int buflen, struct vblk *vb)
 {
 	int r_objid, r_name, r_vtype, r_child, r_size, r_id1, r_id2, r_size2;
 	int r_drive, len;
@@ -1068,11 +1063,11 @@ static BOOL ldm_parse_vol5 (const u8 *buffer, int buflen, struct vblk *vb)
 
 	len = r_drive;
 	if (len < 0)
-		return FALSE;
+		return false;
 
 	len += VBLK_SIZE_VOL5;
 	if (len != BE32 (buffer + 0x14))
-		return FALSE;
+		return false;
 
 	volu = &vb->vblk.volu;
 
@@ -1087,7 +1082,7 @@ static BOOL ldm_parse_vol5 (const u8 *buffer, int buflen, struct vblk *vb)
 		ldm_get_vstr (buffer + 0x53 + r_size,  volu->drive_hint,
 			sizeof (volu->drive_hint));
 	}
-	return TRUE;
+	return true;
 }
 
 /**
@@ -1100,12 +1095,12 @@ static BOOL ldm_parse_vol5 (const u8 *buffer, int buflen, struct vblk *vb)
  * information common to all VBLK types, then delegates the rest of the work to
  * helper functions: ldm_parse_*.
  *
- * Return:  TRUE   @vb contains a VBLK
- *          FALSE  @vb contents are not defined
+ * Return:  'true'   @vb contains a VBLK
+ *          'false'  @vb contents are not defined
  */
-static BOOL ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb)
+static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb)
 {
-	BOOL result = FALSE;
+	bool result = false;
 	int r_objid;
 
 	BUG_ON (!buf || !vb);
@@ -1113,7 +1108,7 @@ static BOOL ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb)
 	r_objid = ldm_relative (buf, len, 0x18, 0);
 	if (r_objid < 0) {
 		ldm_error ("VBLK header is corrupt.");
-		return FALSE;
+		return false;
 	}
 
 	vb->flags  = buf[0x12];
@@ -1152,10 +1147,10 @@ static BOOL ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb)
  *
  * N.B.  This function does not check the validity of the VBLKs.
  *
- * Return:  TRUE   The VBLK was added
- *          FALSE  An error occurred
+ * Return:  'true'   The VBLK was added
+ *          'false'  An error occurred
  */
-static BOOL ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb)
+static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb)
 {
 	struct vblk *vb;
 	struct list_head *item;
@@ -1165,12 +1160,12 @@ static BOOL ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb)
 	vb = kmalloc (sizeof (*vb), GFP_KERNEL);
 	if (!vb) {
 		ldm_crit ("Out of memory.");
-		return FALSE;
+		return false;
 	}
 
 	if (!ldm_parse_vblk (data, len, vb)) {
 		kfree(vb);
-		return FALSE;			/* Already logged */
+		return false;			/* Already logged */
 	}
 
 	/* Put vblk into the correct list. */
@@ -1196,13 +1191,13 @@ static BOOL ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb)
 			if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) &&
 			    (v->vblk.part.start > vb->vblk.part.start)) {
 				list_add_tail (&vb->list, &v->list);
-				return TRUE;
+				return true;
 			}
 		}
 		list_add_tail (&vb->list, &ldb->v_part);
 		break;
 	}
-	return TRUE;
+	return true;
 }
 
 /**
@@ -1214,10 +1209,10 @@ static BOOL ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb)
  * Fragmented VBLKs may not be consecutive in the database, so they are placed
  * in a list so they can be pieced together later.
  *
- * Return:  TRUE   Success, the VBLK was added to the list
- *          FALSE  Error, a problem occurred
+ * Return:  'true'   Success, the VBLK was added to the list
+ *          'false'  Error, a problem occurred
  */
-static BOOL ldm_frag_add (const u8 *data, int size, struct list_head *frags)
+static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
 {
 	struct frag *f;
 	struct list_head *item;
@@ -1230,7 +1225,7 @@ static BOOL ldm_frag_add (const u8 *data, int size, struct list_head *frags)
 	num   = BE16 (data + 0x0E);
 	if ((num < 1) || (num > 4)) {
 		ldm_error ("A VBLK claims to have %d parts.", num);
-		return FALSE;
+		return false;
 	}
 
 	list_for_each (item, frags) {
@@ -1242,7 +1237,7 @@ static BOOL ldm_frag_add (const u8 *data, int size, struct list_head *frags)
 	f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL);
 	if (!f) {
 		ldm_crit ("Out of memory.");
-		return FALSE;
+		return false;
 	}
 
 	f->group = group;
@@ -1255,7 +1250,7 @@ found:
 	if (f->map & (1 << rec)) {
 		ldm_error ("Duplicate VBLK, part %d.", rec);
 		f->map &= 0x7F;			/* Mark the group as broken */
-		return FALSE;
+		return false;
 	}
 
 	f->map |= (1 << rec);
@@ -1266,7 +1261,7 @@ found:
 	}
 	memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size);
 
-	return TRUE;
+	return true;
 }
 
 /**
@@ -1295,10 +1290,10 @@ static void ldm_frag_free (struct list_head *list)
  * Now that all the fragmented VBLKs have been collected, they must be added to
  * the database for later use.
  *
- * Return:  TRUE   All the fragments we added successfully
- *          FALSE  One or more of the fragments we invalid
+ * Return:  'true'   All the fragments we added successfully
+ *          'false'  One or more of the fragments we invalid
  */
-static BOOL ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
+static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
 {
 	struct frag *f;
 	struct list_head *item;
@@ -1311,13 +1306,13 @@ static BOOL ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
 		if (f->map != 0xFF) {
 			ldm_error ("VBLK group %d is incomplete (0x%02x).",
 				f->group, f->map);
-			return FALSE;
+			return false;
 		}
 
 		if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb))
-			return FALSE;		/* Already logged */
+			return false;		/* Already logged */
 	}
-	return TRUE;
+	return true;
 }
 
 /**
@@ -1329,16 +1324,16 @@ static BOOL ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
  * To use the information from the VBLKs, they need to be read from the disk,
  * unpacked and validated.  We cache them in @ldb according to their type.
  *
- * Return:  TRUE   All the VBLKs were read successfully
- *          FALSE  An error occurred
+ * Return:  'true'   All the VBLKs were read successfully
+ *          'false'  An error occurred
  */
-static BOOL ldm_get_vblks (struct block_device *bdev, unsigned long base,
+static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
 			   struct ldmdb *ldb)
 {
 	int size, perbuf, skip, finish, s, v, recs;
 	u8 *data = NULL;
 	Sector sect;
-	BOOL result = FALSE;
+	bool result = false;
 	LIST_HEAD (frags);
 
 	BUG_ON (!bdev || !ldb);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 8f12587c3129..8c7af1777819 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -32,13 +32,11 @@
 #include <asm/unaligned.h>
 
 #define SYS_IND(p)	(get_unaligned(&p->sys_ind))
-#define NR_SECTS(p)	({ __typeof__(p->nr_sects) __a =	\
-				get_unaligned(&p->nr_sects);	\
+#define NR_SECTS(p)	({ __le32 __a =	get_unaligned(&p->nr_sects);	\
 				le32_to_cpu(__a); \
 			})
 
-#define START_SECT(p)	({ __typeof__(p->start_sect) __a =	\
-				get_unaligned(&p->start_sect);	\
+#define START_SECT(p)	({ __le32 __a =	get_unaligned(&p->start_sect);	\
 				le32_to_cpu(__a); \
 			})
 
@@ -58,6 +56,31 @@ msdos_magic_present(unsigned char *p)
 	return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2);
 }
 
+/* Value is EBCDIC 'IBMA' */
+#define AIX_LABEL_MAGIC1	0xC9
+#define AIX_LABEL_MAGIC2	0xC2
+#define AIX_LABEL_MAGIC3	0xD4
+#define AIX_LABEL_MAGIC4	0xC1
+static int aix_magic_present(unsigned char *p, struct block_device *bdev)
+{
+	Sector sect;
+	unsigned char *d;
+	int ret = 0;
+
+	if (p[0] != AIX_LABEL_MAGIC1 &&
+		p[1] != AIX_LABEL_MAGIC2 &&
+		p[2] != AIX_LABEL_MAGIC3 &&
+		p[3] != AIX_LABEL_MAGIC4)
+		return 0;
+	d = read_dev_sector(bdev, 7, &sect);
+	if (d) {
+		if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
+			ret = 1;
+		put_dev_sector(sect);
+	};
+	return ret;
+}
+
 /*
  * Create devices for each logical partition in an extended partition.
  * The logical partitions form a linked list, with each entry being
@@ -393,6 +416,12 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 		return 0;
 	}
 
+	if (aix_magic_present(data, bdev)) {
+		put_dev_sector(sect);
+		printk( " [AIX]");
+		return 0;
+	}
+
 	/*
 	 * Now that the 55aa signature is present, this is probably
 	 * either the boot sector of a FAT filesystem or a DOS-type
diff --git a/fs/pipe.c b/fs/pipe.c
index 20352573e025..b1626f269a34 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -218,9 +218,10 @@ static struct pipe_buf_operations anon_pipe_buf_ops = {
 };
 
 static ssize_t
-pipe_readv(struct file *filp, const struct iovec *_iov,
-	   unsigned long nr_segs, loff_t *ppos)
+pipe_read(struct kiocb *iocb, const struct iovec *_iov,
+	   unsigned long nr_segs, loff_t pos)
 {
+	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct pipe_inode_info *pipe;
 	int do_wakeup;
@@ -330,17 +331,10 @@ redo:
 }
 
 static ssize_t
-pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
-{
-	struct iovec iov = { .iov_base = buf, .iov_len = count };
-
-	return pipe_readv(filp, &iov, 1, ppos);
-}
-
-static ssize_t
-pipe_writev(struct file *filp, const struct iovec *_iov,
-	    unsigned long nr_segs, loff_t *ppos)
+pipe_write(struct kiocb *iocb, const struct iovec *_iov,
+	    unsigned long nr_segs, loff_t ppos)
 {
+	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct pipe_inode_info *pipe;
 	ssize_t ret;
@@ -510,15 +504,6 @@ out:
 }
 
 static ssize_t
-pipe_write(struct file *filp, const char __user *buf,
-	   size_t count, loff_t *ppos)
-{
-	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
-
-	return pipe_writev(filp, &iov, 1, ppos);
-}
-
-static ssize_t
 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
 {
 	return -EBADF;
@@ -736,8 +721,8 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
  */
 const struct file_operations read_fifo_fops = {
 	.llseek		= no_llseek,
-	.read		= pipe_read,
-	.readv		= pipe_readv,
+	.read		= do_sync_read,
+	.aio_read	= pipe_read,
 	.write		= bad_pipe_w,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
@@ -749,8 +734,8 @@ const struct file_operations read_fifo_fops = {
 const struct file_operations write_fifo_fops = {
 	.llseek		= no_llseek,
 	.read		= bad_pipe_r,
-	.write		= pipe_write,
-	.writev		= pipe_writev,
+	.write		= do_sync_write,
+	.aio_write	= pipe_write,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_write_open,
@@ -760,10 +745,10 @@ const struct file_operations write_fifo_fops = {
 
 const struct file_operations rdwr_fifo_fops = {
 	.llseek		= no_llseek,
-	.read		= pipe_read,
-	.readv		= pipe_readv,
-	.write		= pipe_write,
-	.writev		= pipe_writev,
+	.read		= do_sync_read,
+	.aio_read	= pipe_read,
+	.write		= do_sync_write,
+	.aio_write	= pipe_write,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_rdwr_open,
@@ -773,8 +758,8 @@ const struct file_operations rdwr_fifo_fops = {
 
 static struct file_operations read_pipe_fops = {
 	.llseek		= no_llseek,
-	.read		= pipe_read,
-	.readv		= pipe_readv,
+	.read		= do_sync_read,
+	.aio_read	= pipe_read,
 	.write		= bad_pipe_w,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
@@ -786,8 +771,8 @@ static struct file_operations read_pipe_fops = {
 static struct file_operations write_pipe_fops = {
 	.llseek		= no_llseek,
 	.read		= bad_pipe_r,
-	.write		= pipe_write,
-	.writev		= pipe_writev,
+	.write		= do_sync_write,
+	.aio_write	= pipe_write,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_write_open,
@@ -797,10 +782,10 @@ static struct file_operations write_pipe_fops = {
 
 static struct file_operations rdwr_pipe_fops = {
 	.llseek		= no_llseek,
-	.read		= pipe_read,
-	.readv		= pipe_readv,
-	.write		= pipe_write,
-	.writev		= pipe_writev,
+	.read		= do_sync_read,
+	.aio_read	= pipe_read,
+	.write		= do_sync_write,
+	.aio_write	= pipe_write,
 	.poll		= pipe_poll,
 	.ioctl		= pipe_ioctl,
 	.open		= pipe_rdwr_open,
@@ -879,7 +864,6 @@ static struct inode * get_pipe_inode(void)
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blksize = PAGE_SIZE;
 
 	return inode;
 
@@ -890,87 +874,118 @@ fail_inode:
 	return NULL;
 }
 
-int do_pipe(int *fd)
+struct file *create_write_pipe(void)
 {
-	struct qstr this;
-	char name[32];
+	int err;
+	struct inode *inode;
+	struct file *f;
 	struct dentry *dentry;
-	struct inode * inode;
-	struct file *f1, *f2;
-	int error;
-	int i, j;
-
-	error = -ENFILE;
-	f1 = get_empty_filp();
-	if (!f1)
-		goto no_files;
-
-	f2 = get_empty_filp();
-	if (!f2)
-		goto close_f1;
+	char name[32];
+	struct qstr this;
 
+	f = get_empty_filp();
+	if (!f)
+		return ERR_PTR(-ENFILE);
+	err = -ENFILE;
 	inode = get_pipe_inode();
 	if (!inode)
-		goto close_f12;
-
-	error = get_unused_fd();
-	if (error < 0)
-		goto close_f12_inode;
-	i = error;
-
-	error = get_unused_fd();
-	if (error < 0)
-		goto close_f12_inode_i;
-	j = error;
+		goto err_file;
 
-	error = -ENOMEM;
 	sprintf(name, "[%lu]", inode->i_ino);
 	this.name = name;
 	this.len = strlen(name);
 	this.hash = inode->i_ino; /* will go */
+	err = -ENOMEM;
 	dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this);
 	if (!dentry)
-		goto close_f12_inode_i_j;
+		goto err_inode;
 
 	dentry->d_op = &pipefs_dentry_operations;
 	d_add(dentry, inode);
-	f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt));
-	f1->f_dentry = f2->f_dentry = dget(dentry);
-	f1->f_mapping = f2->f_mapping = inode->i_mapping;
-
-	/* read file */
-	f1->f_pos = f2->f_pos = 0;
-	f1->f_flags = O_RDONLY;
-	f1->f_op = &read_pipe_fops;
-	f1->f_mode = FMODE_READ;
-	f1->f_version = 0;
-
-	/* write file */
-	f2->f_flags = O_WRONLY;
-	f2->f_op = &write_pipe_fops;
-	f2->f_mode = FMODE_WRITE;
-	f2->f_version = 0;
-
-	fd_install(i, f1);
-	fd_install(j, f2);
-	fd[0] = i;
-	fd[1] = j;
+	f->f_vfsmnt = mntget(pipe_mnt);
+	f->f_dentry = dentry;
+	f->f_mapping = inode->i_mapping;
 
-	return 0;
+	f->f_flags = O_WRONLY;
+	f->f_op = &write_pipe_fops;
+	f->f_mode = FMODE_WRITE;
+	f->f_version = 0;
+
+	return f;
 
-close_f12_inode_i_j:
-	put_unused_fd(j);
-close_f12_inode_i:
-	put_unused_fd(i);
-close_f12_inode:
+ err_inode:
 	free_pipe_info(inode);
 	iput(inode);
-close_f12:
-	put_filp(f2);
-close_f1:
-	put_filp(f1);
-no_files:
-	return error;	
+ err_file:
+	put_filp(f);
+	return ERR_PTR(err);
+}
+
+void free_write_pipe(struct file *f)
+{
+	mntput(f->f_vfsmnt);
+	dput(f->f_dentry);
+	put_filp(f);
+}
+
+struct file *create_read_pipe(struct file *wrf)
+{
+	struct file *f = get_empty_filp();
+	if (!f)
+		return ERR_PTR(-ENFILE);
+
+	/* Grab pipe from the writer */
+	f->f_vfsmnt = mntget(wrf->f_vfsmnt);
+	f->f_dentry = dget(wrf->f_dentry);
+	f->f_mapping = wrf->f_dentry->d_inode->i_mapping;
+
+	f->f_pos = 0;
+	f->f_flags = O_RDONLY;
+	f->f_op = &read_pipe_fops;
+	f->f_mode = FMODE_READ;
+	f->f_version = 0;
+
+	return f;
+}
+
+int do_pipe(int *fd)
+{
+	struct file *fw, *fr;
+	int error;
+	int fdw, fdr;
+
+	fw = create_write_pipe();
+	if (IS_ERR(fw))
+		return PTR_ERR(fw);
+	fr = create_read_pipe(fw);
+	error = PTR_ERR(fr);
+	if (IS_ERR(fr))
+		goto err_write_pipe;
+
+	error = get_unused_fd();
+	if (error < 0)
+		goto err_read_pipe;
+	fdr = error;
+
+	error = get_unused_fd();
+	if (error < 0)
+		goto err_fdr;
+	fdw = error;
+
+	fd_install(fdr, fr);
+	fd_install(fdw, fw);
+	fd[0] = fdr;
+	fd[1] = fdw;
+
+	return 0;
+
+ err_fdr:
+	put_unused_fd(fdr);
+ err_read_pipe:
+	put_filp(fr);
+ err_write_pipe:
+	free_write_pipe(fw);
+	return error;
 }
 
 /*
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 6c8dcf7613fd..aec931e09973 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -58,11 +58,9 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
 	if (acl) {
 		int size = sizeof(struct posix_acl) + acl->a_count *
 		           sizeof(struct posix_acl_entry);
-		clone = kmalloc(size, flags);
-		if (clone) {
-			memcpy(clone, acl, size);
+		clone = kmemdup(acl, size, flags);
+		if (clone)
 			atomic_set(&clone->a_refcount, 1);
-		}
 	}
 	return clone;
 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0b615d62a159..25e917fb4739 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -162,7 +162,7 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 	int g;
 	struct fdtable *fdt = NULL;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
 		"SleepAVG:\t%lu%%\n"
@@ -174,14 +174,13 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
 		(p->sleep_avg/1024)*100/(1020000000/1024),
-	       	p->tgid,
-		p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0,
-		pid_alive(p) && p->ptrace ? p->parent->pid : 0,
+	       	p->tgid, p->pid,
+	       	pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
+		pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
 		p->uid, p->euid, p->suid, p->fsuid,
 		p->gid, p->egid, p->sgid, p->fsgid);
-	read_unlock(&tasklist_lock);
+
 	task_lock(p);
-	rcu_read_lock();
 	if (p->files)
 		fdt = files_fdtable(p->files);
 	buffer += sprintf(buffer,
@@ -244,6 +243,7 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
 
 static inline char * task_sig(struct task_struct *p, char *buffer)
 {
+	unsigned long flags;
 	sigset_t pending, shpending, blocked, ignored, caught;
 	int num_threads = 0;
 	unsigned long qsize = 0;
@@ -255,10 +255,8 @@ static inline char * task_sig(struct task_struct *p, char *buffer)
 	sigemptyset(&ignored);
 	sigemptyset(&caught);
 
-	/* Gather all the data with the appropriate locks held */
-	read_lock(&tasklist_lock);
-	if (p->sighand) {
-		spin_lock_irq(&p->sighand->siglock);
+	rcu_read_lock();
+	if (lock_task_sighand(p, &flags)) {
 		pending = p->pending.signal;
 		shpending = p->signal->shared_pending.signal;
 		blocked = p->blocked;
@@ -266,9 +264,9 @@ static inline char * task_sig(struct task_struct *p, char *buffer)
 		num_threads = atomic_read(&p->signal->count);
 		qsize = atomic_read(&p->user->sigpending);
 		qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
-		spin_unlock_irq(&p->sighand->siglock);
+		unlock_task_sighand(p, &flags);
 	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	buffer += sprintf(buffer, "Threads:\t%d\n", num_threads);
 	buffer += sprintf(buffer, "SigQ:\t%lu/%lu\n", qsize, qlim);
@@ -322,7 +320,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 	sigset_t sigign, sigcatch;
 	char state;
 	int res;
- 	pid_t ppid, pgid = -1, sid = -1;
+ 	pid_t ppid = 0, pgid = -1, sid = -1;
 	int num_threads = 0;
 	struct mm_struct *mm;
 	unsigned long long start_time;
@@ -330,8 +328,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 	unsigned long  min_flt = 0,  maj_flt = 0;
 	cputime_t cutime, cstime, utime, stime;
 	unsigned long rsslim = 0;
-	struct task_struct *t;
 	char tcomm[sizeof(task->comm)];
+	unsigned long flags;
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
@@ -347,15 +345,35 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
 	cutime = cstime = utime = stime = cputime_zero;
-	read_lock(&tasklist_lock);
-	if (task->sighand) {
-		spin_lock_irq(&task->sighand->siglock);
-		num_threads = atomic_read(&task->signal->count);
+
+	mutex_lock(&tty_mutex);
+	rcu_read_lock();
+	if (lock_task_sighand(task, &flags)) {
+		struct signal_struct *sig = task->signal;
+		struct tty_struct *tty = sig->tty;
+
+		if (tty) {
+			/*
+			 * sig->tty is not stable, but tty_mutex
+			 * protects us from release_dev(tty)
+			 */
+			barrier();
+			tty_pgrp = tty->pgrp;
+			tty_nr = new_encode_dev(tty_devnum(tty));
+		}
+
+		num_threads = atomic_read(&sig->count);
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
+		cmin_flt = sig->cmin_flt;
+		cmaj_flt = sig->cmaj_flt;
+		cutime = sig->cutime;
+		cstime = sig->cstime;
+		rsslim = sig->rlim[RLIMIT_RSS].rlim_cur;
+
 		/* add up live thread stats at the group level */
 		if (whole) {
-			t = task;
+			struct task_struct *t = task;
 			do {
 				min_flt += t->min_flt;
 				maj_flt += t->maj_flt;
@@ -363,31 +381,21 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 				stime = cputime_add(stime, t->stime);
 				t = next_thread(t);
 			} while (t != task);
-		}
 
-		spin_unlock_irq(&task->sighand->siglock);
-	}
-	if (task->signal) {
-		if (task->signal->tty) {
-			tty_pgrp = task->signal->tty->pgrp;
-			tty_nr = new_encode_dev(tty_devnum(task->signal->tty));
+			min_flt += sig->min_flt;
+			maj_flt += sig->maj_flt;
+			utime = cputime_add(utime, sig->utime);
+			stime = cputime_add(stime, sig->stime);
 		}
+
+		sid = sig->session;
 		pgid = process_group(task);
-		sid = task->signal->session;
-		cmin_flt = task->signal->cmin_flt;
-		cmaj_flt = task->signal->cmaj_flt;
-		cutime = task->signal->cutime;
-		cstime = task->signal->cstime;
-		rsslim = task->signal->rlim[RLIMIT_RSS].rlim_cur;
-		if (whole) {
-			min_flt += task->signal->min_flt;
-			maj_flt += task->signal->maj_flt;
-			utime = cputime_add(utime, task->signal->utime);
-			stime = cputime_add(stime, task->signal->stime);
-		}
+		ppid = rcu_dereference(task->real_parent)->tgid;
+
+		unlock_task_sighand(task, &flags);
 	}
-	ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0;
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
+	mutex_unlock(&tty_mutex);
 
 	if (!whole || num_threads<2)
 		wchan = get_wchan(task);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index fe8d55fb17cc..8df27401d292 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -71,6 +71,8 @@
 #include <linux/cpuset.h>
 #include <linux/audit.h>
 #include <linux/poll.h>
+#include <linux/nsproxy.h>
+#include <linux/oom.h>
 #include "internal.h"
 
 /* NOTE:
@@ -83,262 +85,44 @@
  *	in /proc for a task before it execs a suid executable.
  */
 
-/*
- * For hysterical raisins we keep the same inumbers as in the old procfs.
- * Feel free to change the macro below - just keep the range distinct from
- * inumbers of the rest of procfs (currently those are in 0x0000--0xffff).
- * As soon as we'll get a separate superblock we will be able to forget
- * about magical ranges too.
- */
-
-#define fake_ino(pid,ino) (((pid)<<16)|(ino))
-
-enum pid_directory_inos {
-	PROC_TGID_INO = 2,
-	PROC_TGID_TASK,
-	PROC_TGID_STATUS,
-	PROC_TGID_MEM,
-#ifdef CONFIG_SECCOMP
-	PROC_TGID_SECCOMP,
-#endif
-	PROC_TGID_CWD,
-	PROC_TGID_ROOT,
-	PROC_TGID_EXE,
-	PROC_TGID_FD,
-	PROC_TGID_ENVIRON,
-	PROC_TGID_AUXV,
-	PROC_TGID_CMDLINE,
-	PROC_TGID_STAT,
-	PROC_TGID_STATM,
-	PROC_TGID_MAPS,
-	PROC_TGID_NUMA_MAPS,
-	PROC_TGID_MOUNTS,
-	PROC_TGID_MOUNTSTATS,
-	PROC_TGID_WCHAN,
-#ifdef CONFIG_MMU
-	PROC_TGID_SMAPS,
-#endif
-#ifdef CONFIG_SCHEDSTATS
-	PROC_TGID_SCHEDSTAT,
-#endif
-#ifdef CONFIG_CPUSETS
-	PROC_TGID_CPUSET,
-#endif
-#ifdef CONFIG_SECURITY
-	PROC_TGID_ATTR,
-	PROC_TGID_ATTR_CURRENT,
-	PROC_TGID_ATTR_PREV,
-	PROC_TGID_ATTR_EXEC,
-	PROC_TGID_ATTR_FSCREATE,
-	PROC_TGID_ATTR_KEYCREATE,
-	PROC_TGID_ATTR_SOCKCREATE,
-#endif
-#ifdef CONFIG_AUDITSYSCALL
-	PROC_TGID_LOGINUID,
-#endif
-	PROC_TGID_OOM_SCORE,
-	PROC_TGID_OOM_ADJUST,
-	PROC_TID_INO,
-	PROC_TID_STATUS,
-	PROC_TID_MEM,
-#ifdef CONFIG_SECCOMP
-	PROC_TID_SECCOMP,
-#endif
-	PROC_TID_CWD,
-	PROC_TID_ROOT,
-	PROC_TID_EXE,
-	PROC_TID_FD,
-	PROC_TID_ENVIRON,
-	PROC_TID_AUXV,
-	PROC_TID_CMDLINE,
-	PROC_TID_STAT,
-	PROC_TID_STATM,
-	PROC_TID_MAPS,
-	PROC_TID_NUMA_MAPS,
-	PROC_TID_MOUNTS,
-	PROC_TID_MOUNTSTATS,
-	PROC_TID_WCHAN,
-#ifdef CONFIG_MMU
-	PROC_TID_SMAPS,
-#endif
-#ifdef CONFIG_SCHEDSTATS
-	PROC_TID_SCHEDSTAT,
-#endif
-#ifdef CONFIG_CPUSETS
-	PROC_TID_CPUSET,
-#endif
-#ifdef CONFIG_SECURITY
-	PROC_TID_ATTR,
-	PROC_TID_ATTR_CURRENT,
-	PROC_TID_ATTR_PREV,
-	PROC_TID_ATTR_EXEC,
-	PROC_TID_ATTR_FSCREATE,
-	PROC_TID_ATTR_KEYCREATE,
-	PROC_TID_ATTR_SOCKCREATE,
-#endif
-#ifdef CONFIG_AUDITSYSCALL
-	PROC_TID_LOGINUID,
-#endif
-	PROC_TID_OOM_SCORE,
-	PROC_TID_OOM_ADJUST,
-
-	/* Add new entries before this */
-	PROC_TID_FD_DIR = 0x8000,	/* 0x8000-0xffff */
-};
 
 /* Worst case buffer size needed for holding an integer. */
-#define PROC_NUMBUF 10
+#define PROC_NUMBUF 13
 
 struct pid_entry {
-	int type;
 	int len;
 	char *name;
 	mode_t mode;
+	struct inode_operations *iop;
+	struct file_operations *fop;
+	union proc_op op;
 };
 
-#define E(type,name,mode) {(type),sizeof(name)-1,(name),(mode)}
-
-static struct pid_entry tgid_base_stuff[] = {
-	E(PROC_TGID_TASK,      "task",    S_IFDIR|S_IRUGO|S_IXUGO),
-	E(PROC_TGID_FD,        "fd",      S_IFDIR|S_IRUSR|S_IXUSR),
-	E(PROC_TGID_ENVIRON,   "environ", S_IFREG|S_IRUSR),
-	E(PROC_TGID_AUXV,      "auxv",	  S_IFREG|S_IRUSR),
-	E(PROC_TGID_STATUS,    "status",  S_IFREG|S_IRUGO),
-	E(PROC_TGID_CMDLINE,   "cmdline", S_IFREG|S_IRUGO),
-	E(PROC_TGID_STAT,      "stat",    S_IFREG|S_IRUGO),
-	E(PROC_TGID_STATM,     "statm",   S_IFREG|S_IRUGO),
-	E(PROC_TGID_MAPS,      "maps",    S_IFREG|S_IRUGO),
-#ifdef CONFIG_NUMA
-	E(PROC_TGID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO),
-#endif
-	E(PROC_TGID_MEM,       "mem",     S_IFREG|S_IRUSR|S_IWUSR),
-#ifdef CONFIG_SECCOMP
-	E(PROC_TGID_SECCOMP,   "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
-#endif
-	E(PROC_TGID_CWD,       "cwd",     S_IFLNK|S_IRWXUGO),
-	E(PROC_TGID_ROOT,      "root",    S_IFLNK|S_IRWXUGO),
-	E(PROC_TGID_EXE,       "exe",     S_IFLNK|S_IRWXUGO),
-	E(PROC_TGID_MOUNTS,    "mounts",  S_IFREG|S_IRUGO),
-	E(PROC_TGID_MOUNTSTATS, "mountstats", S_IFREG|S_IRUSR),
-#ifdef CONFIG_MMU
-	E(PROC_TGID_SMAPS,     "smaps",   S_IFREG|S_IRUGO),
-#endif
-#ifdef CONFIG_SECURITY
-	E(PROC_TGID_ATTR,      "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
-#endif
-#ifdef CONFIG_KALLSYMS
-	E(PROC_TGID_WCHAN,     "wchan",   S_IFREG|S_IRUGO),
-#endif
-#ifdef CONFIG_SCHEDSTATS
-	E(PROC_TGID_SCHEDSTAT, "schedstat", S_IFREG|S_IRUGO),
-#endif
-#ifdef CONFIG_CPUSETS
-	E(PROC_TGID_CPUSET,    "cpuset",  S_IFREG|S_IRUGO),
-#endif
-	E(PROC_TGID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO),
-	E(PROC_TGID_OOM_ADJUST,"oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
-#ifdef CONFIG_AUDITSYSCALL
-	E(PROC_TGID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO),
-#endif
-	{0,0,NULL,0}
-};
-static struct pid_entry tid_base_stuff[] = {
-	E(PROC_TID_FD,         "fd",      S_IFDIR|S_IRUSR|S_IXUSR),
-	E(PROC_TID_ENVIRON,    "environ", S_IFREG|S_IRUSR),
-	E(PROC_TID_AUXV,       "auxv",	  S_IFREG|S_IRUSR),
-	E(PROC_TID_STATUS,     "status",  S_IFREG|S_IRUGO),
-	E(PROC_TID_CMDLINE,    "cmdline", S_IFREG|S_IRUGO),
-	E(PROC_TID_STAT,       "stat",    S_IFREG|S_IRUGO),
-	E(PROC_TID_STATM,      "statm",   S_IFREG|S_IRUGO),
-	E(PROC_TID_MAPS,       "maps",    S_IFREG|S_IRUGO),
-#ifdef CONFIG_NUMA
-	E(PROC_TID_NUMA_MAPS,  "numa_maps",    S_IFREG|S_IRUGO),
-#endif
-	E(PROC_TID_MEM,        "mem",     S_IFREG|S_IRUSR|S_IWUSR),
-#ifdef CONFIG_SECCOMP
-	E(PROC_TID_SECCOMP,    "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
-#endif
-	E(PROC_TID_CWD,        "cwd",     S_IFLNK|S_IRWXUGO),
-	E(PROC_TID_ROOT,       "root",    S_IFLNK|S_IRWXUGO),
-	E(PROC_TID_EXE,        "exe",     S_IFLNK|S_IRWXUGO),
-	E(PROC_TID_MOUNTS,     "mounts",  S_IFREG|S_IRUGO),
-#ifdef CONFIG_MMU
-	E(PROC_TID_SMAPS,      "smaps",   S_IFREG|S_IRUGO),
-#endif
-#ifdef CONFIG_SECURITY
-	E(PROC_TID_ATTR,       "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
-#endif
-#ifdef CONFIG_KALLSYMS
-	E(PROC_TID_WCHAN,      "wchan",   S_IFREG|S_IRUGO),
-#endif
-#ifdef CONFIG_SCHEDSTATS
-	E(PROC_TID_SCHEDSTAT, "schedstat",S_IFREG|S_IRUGO),
-#endif
-#ifdef CONFIG_CPUSETS
-	E(PROC_TID_CPUSET,     "cpuset",  S_IFREG|S_IRUGO),
-#endif
-	E(PROC_TID_OOM_SCORE,  "oom_score",S_IFREG|S_IRUGO),
-	E(PROC_TID_OOM_ADJUST, "oom_adj", S_IFREG|S_IRUGO|S_IWUSR),
-#ifdef CONFIG_AUDITSYSCALL
-	E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO),
-#endif
-	{0,0,NULL,0}
-};
-
-#ifdef CONFIG_SECURITY
-static struct pid_entry tgid_attr_stuff[] = {
-	E(PROC_TGID_ATTR_CURRENT,  "current",  S_IFREG|S_IRUGO|S_IWUGO),
-	E(PROC_TGID_ATTR_PREV,     "prev",     S_IFREG|S_IRUGO),
-	E(PROC_TGID_ATTR_EXEC,     "exec",     S_IFREG|S_IRUGO|S_IWUGO),
-	E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
-	E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
-	E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
-	{0,0,NULL,0}
-};
-static struct pid_entry tid_attr_stuff[] = {
-	E(PROC_TID_ATTR_CURRENT,   "current",  S_IFREG|S_IRUGO|S_IWUGO),
-	E(PROC_TID_ATTR_PREV,      "prev",     S_IFREG|S_IRUGO),
-	E(PROC_TID_ATTR_EXEC,      "exec",     S_IFREG|S_IRUGO|S_IWUGO),
-	E(PROC_TID_ATTR_FSCREATE,  "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
-	E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
-	E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
-	{0,0,NULL,0}
-};
-#endif
-
-#undef E
-
-static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
-{
-	struct task_struct *task = get_proc_task(inode);
-	struct files_struct *files = NULL;
-	struct file *file;
-	int fd = proc_fd(inode);
-
-	if (task) {
-		files = get_files_struct(task);
-		put_task_struct(task);
-	}
-	if (files) {
-		/*
-		 * We are not taking a ref to the file structure, so we must
-		 * hold ->file_lock.
-		 */
-		spin_lock(&files->file_lock);
-		file = fcheck_files(files, fd);
-		if (file) {
-			*mnt = mntget(file->f_vfsmnt);
-			*dentry = dget(file->f_dentry);
-			spin_unlock(&files->file_lock);
-			put_files_struct(files);
-			return 0;
-		}
-		spin_unlock(&files->file_lock);
-		put_files_struct(files);
-	}
-	return -ENOENT;
+#define NOD(NAME, MODE, IOP, FOP, OP) {			\
+	.len  = sizeof(NAME) - 1,			\
+	.name = (NAME),					\
+	.mode = MODE,					\
+	.iop  = IOP,					\
+	.fop  = FOP,					\
+	.op   = OP,					\
 }
 
+#define DIR(NAME, MODE, OTYPE)							\
+	NOD(NAME, (S_IFDIR|(MODE)),						\
+		&proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations,	\
+		{} )
+#define LNK(NAME, OTYPE)					\
+	NOD(NAME, (S_IFLNK|S_IRWXUGO),				\
+		&proc_pid_link_inode_operations, NULL,		\
+		{ .proc_get_link = &proc_##OTYPE##_link } )
+#define REG(NAME, MODE, OTYPE)				\
+	NOD(NAME, (S_IFREG|(MODE)), NULL,		\
+		&proc_##OTYPE##_operations, {})
+#define INF(NAME, MODE, OTYPE)				\
+	NOD(NAME, (S_IFREG|(MODE)), 			\
+		NULL, &proc_info_file_operations,	\
+		{ .proc_read = &proc_##OTYPE } )
+
 static struct fs_struct *get_fs_struct(struct task_struct *task)
 {
 	struct fs_struct *fs;
@@ -587,7 +371,7 @@ static int mounts_open(struct inode *inode, struct file *file)
 
 	if (task) {
 		task_lock(task);
-		namespace = task->namespace;
+		namespace = task->nsproxy->namespace;
 		if (namespace)
 			get_namespace(namespace);
 		task_unlock(task);
@@ -658,7 +442,7 @@ static int mountstats_open(struct inode *inode, struct file *file)
 
 		if (task) {
 			task_lock(task);
-			namespace = task->namespace;
+			namespace = task->nsproxy->namespace;
 			if (namespace)
 				get_namespace(namespace);
 			task_unlock(task);
@@ -797,7 +581,7 @@ out_no_task:
 static ssize_t mem_write(struct file * file, const char * buf,
 			 size_t count, loff_t *ppos)
 {
-	int copied = 0;
+	int copied;
 	char *page;
 	struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 	unsigned long dst = *ppos;
@@ -814,6 +598,7 @@ static ssize_t mem_write(struct file * file, const char * buf,
 	if (!page)
 		goto out;
 
+	copied = 0;
 	while (count > 0) {
 		int this_len, retval;
 
@@ -905,7 +690,8 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	if (copy_from_user(buffer, buf, count))
 		return -EFAULT;
 	oom_adjust = simple_strtol(buffer, &end, 0);
-	if ((oom_adjust < -16 || oom_adjust > 15) && oom_adjust != OOM_DISABLE)
+	if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
+	     oom_adjust != OOM_DISABLE)
 		return -EINVAL;
 	if (*end == '\n')
 		end++;
@@ -1136,143 +922,6 @@ static struct inode_operations proc_pid_link_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
-{
-	struct dentry *dentry = filp->f_dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = get_proc_task(inode);
-	unsigned int fd, tid, ino;
-	int retval;
-	char buf[PROC_NUMBUF];
-	struct files_struct * files;
-	struct fdtable *fdt;
-
-	retval = -ENOENT;
-	if (!p)
-		goto out_no_task;
-	retval = 0;
-	tid = p->pid;
-
-	fd = filp->f_pos;
-	switch (fd) {
-		case 0:
-			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		default:
-			files = get_files_struct(p);
-			if (!files)
-				goto out;
-			rcu_read_lock();
-			fdt = files_fdtable(files);
-			for (fd = filp->f_pos-2;
-			     fd < fdt->max_fds;
-			     fd++, filp->f_pos++) {
-				unsigned int i,j;
-
-				if (!fcheck_files(files, fd))
-					continue;
-				rcu_read_unlock();
-
-				j = PROC_NUMBUF;
-				i = fd;
-				do {
-					j--;
-					buf[j] = '0' + (i % 10);
-					i /= 10;
-				} while (i);
-
-				ino = fake_ino(tid, PROC_TID_FD_DIR + fd);
-				if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
-					rcu_read_lock();
-					break;
-				}
-				rcu_read_lock();
-			}
-			rcu_read_unlock();
-			put_files_struct(files);
-	}
-out:
-	put_task_struct(p);
-out_no_task:
-	return retval;
-}
-
-static int proc_pident_readdir(struct file *filp,
-		void *dirent, filldir_t filldir,
-		struct pid_entry *ents, unsigned int nents)
-{
-	int i;
-	int pid;
-	struct dentry *dentry = filp->f_dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
-	struct pid_entry *p;
-	ino_t ino;
-	int ret;
-
-	ret = -ENOENT;
-	if (!task)
-		goto out;
-
-	ret = 0;
-	pid = task->pid;
-	put_task_struct(task);
-	i = filp->f_pos;
-	switch (i) {
-	case 0:
-		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	case 1:
-		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-			goto out;
-		i++;
-		filp->f_pos++;
-		/* fall through */
-	default:
-		i -= 2;
-		if (i >= nents) {
-			ret = 1;
-			goto out;
-		}
-		p = ents + i;
-		while (p->name) {
-			if (filldir(dirent, p->name, p->len, filp->f_pos,
-				    fake_ino(pid, p->type), p->mode >> 12) < 0)
-				goto out;
-			filp->f_pos++;
-			p++;
-		}
-	}
-
-	ret = 1;
-out:
-	return ret;
-}
-
-static int proc_tgid_base_readdir(struct file * filp,
-			     void * dirent, filldir_t filldir)
-{
-	return proc_pident_readdir(filp,dirent,filldir,
-				   tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
-}
-
-static int proc_tid_base_readdir(struct file * filp,
-			     void * dirent, filldir_t filldir)
-{
-	return proc_pident_readdir(filp,dirent,filldir,
-				   tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
-}
 
 /* building an inode */
 
@@ -1292,13 +941,13 @@ static int task_dumpable(struct task_struct *task)
 }
 
 
-static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, int ino)
+static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
 {
 	struct inode * inode;
 	struct proc_inode *ei;
 
 	/* We need a new inode */
-	
+
 	inode = new_inode(sb);
 	if (!inode)
 		goto out;
@@ -1306,13 +955,12 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	/* Common stuff */
 	ei = PROC_I(inode);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_ino = fake_ino(task->pid, ino);
 	inode->i_op = &proc_def_inode_operations;
 
 	/*
 	 * grab the reference to task.
 	 */
-	ei->pid = get_pid(task->pids[PIDTYPE_PID].pid);
+	ei->pid = get_task_pid(task, PIDTYPE_PID);
 	if (!ei->pid)
 		goto out_unlock;
 
@@ -1332,6 +980,27 @@ out_unlock:
 	return NULL;
 }
 
+static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	generic_fillattr(inode, stat);
+
+	rcu_read_lock();
+	stat->uid = 0;
+	stat->gid = 0;
+	task = pid_task(proc_pid(inode), PIDTYPE_PID);
+	if (task) {
+		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+		    task_dumpable(task)) {
+			stat->uid = task->euid;
+			stat->gid = task->egid;
+		}
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
 /* dentry stuff */
 
 /*
@@ -1371,25 +1040,130 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
-static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+static int pid_delete_dentry(struct dentry * dentry)
 {
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task;
-	generic_fillattr(inode, stat);
+	/* Is the task we represent dead?
+	 * If so, then don't put the dentry on the lru list,
+	 * kill it immediately.
+	 */
+	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
+}
+
+static struct dentry_operations pid_dentry_operations =
+{
+	.d_revalidate	= pid_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+/* Lookups */
+
+typedef struct dentry *instantiate_t(struct inode *, struct dentry *, struct task_struct *, void *);
+
+/*
+ * Fill a directory entry.
+ *
+ * If possible create the dcache entry and derive our inode number and
+ * file type from dcache entry.
+ *
+ * Since all of the proc inode numbers are dynamically generated, the inode
+ * numbers do not exist until the inode is cache.  This means creating the
+ * the dcache entry in readdir is necessary to keep the inode numbers
+ * reported by readdir in sync with the inode numbers reported
+ * by stat.
+ */
+static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+	char *name, int len,
+	instantiate_t instantiate, struct task_struct *task, void *ptr)
+{
+	struct dentry *child, *dir = filp->f_dentry;
+	struct inode *inode;
+	struct qstr qname;
+	ino_t ino = 0;
+	unsigned type = DT_UNKNOWN;
+
+	qname.name = name;
+	qname.len  = len;
+	qname.hash = full_name_hash(name, len);
+
+	child = d_lookup(dir, &qname);
+	if (!child) {
+		struct dentry *new;
+		new = d_alloc(dir, &qname);
+		if (new) {
+			child = instantiate(dir->d_inode, new, task, ptr);
+			if (child)
+				dput(new);
+			else
+				child = new;
+		}
+	}
+	if (!child || IS_ERR(child) || !child->d_inode)
+		goto end_instantiate;
+	inode = child->d_inode;
+	if (inode) {
+		ino = inode->i_ino;
+		type = inode->i_mode >> 12;
+	}
+	dput(child);
+end_instantiate:
+	if (!ino)
+		ino = find_inode_number(dir, &qname);
+	if (!ino)
+		ino = 1;
+	return filldir(dirent, name, len, filp->f_pos, ino, type);
+}
+
+static unsigned name_to_int(struct dentry *dentry)
+{
+	const char *name = dentry->d_name.name;
+	int len = dentry->d_name.len;
+	unsigned n = 0;
+
+	if (len > 1 && *name == '0')
+		goto out;
+	while (len-- > 0) {
+		unsigned c = *name++ - '0';
+		if (c > 9)
+			goto out;
+		if (n >= (~0U-9)/10)
+			goto out;
+		n *= 10;
+		n += c;
+	}
+	return n;
+out:
+	return ~0U;
+}
+
+static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+{
+	struct task_struct *task = get_proc_task(inode);
+	struct files_struct *files = NULL;
+	struct file *file;
+	int fd = proc_fd(inode);
 
-	rcu_read_lock();
-	stat->uid = 0;
-	stat->gid = 0;
-	task = pid_task(proc_pid(inode), PIDTYPE_PID);
 	if (task) {
-		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
-		    task_dumpable(task)) {
-			stat->uid = task->euid;
-			stat->gid = task->egid;
+		files = get_files_struct(task);
+		put_task_struct(task);
+	}
+	if (files) {
+		/*
+		 * We are not taking a ref to the file structure, so we must
+		 * hold ->file_lock.
+		 */
+		spin_lock(&files->file_lock);
+		file = fcheck_files(files, fd);
+		if (file) {
+			*mnt = mntget(file->f_vfsmnt);
+			*dentry = dget(file->f_dentry);
+			spin_unlock(&files->file_lock);
+			put_files_struct(files);
+			return 0;
 		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
 	}
-	rcu_read_unlock();
-	return 0;
+	return -ENOENT;
 }
 
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -1427,75 +1201,30 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
-static int pid_delete_dentry(struct dentry * dentry)
-{
-	/* Is the task we represent dead?
-	 * If so, then don't put the dentry on the lru list,
-	 * kill it immediately.
-	 */
-	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
-}
-
 static struct dentry_operations tid_fd_dentry_operations =
 {
 	.d_revalidate	= tid_fd_revalidate,
 	.d_delete	= pid_delete_dentry,
 };
 
-static struct dentry_operations pid_dentry_operations =
-{
-	.d_revalidate	= pid_revalidate,
-	.d_delete	= pid_delete_dentry,
-};
-
-/* Lookups */
-
-static unsigned name_to_int(struct dentry *dentry)
-{
-	const char *name = dentry->d_name.name;
-	int len = dentry->d_name.len;
-	unsigned n = 0;
-
-	if (len > 1 && *name == '0')
-		goto out;
-	while (len-- > 0) {
-		unsigned c = *name++ - '0';
-		if (c > 9)
-			goto out;
-		if (n >= (~0U-9)/10)
-			goto out;
-		n *= 10;
-		n += c;
-	}
-	return n;
-out:
-	return ~0U;
-}
-
-/* SMP-safe */
-static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+static struct dentry *proc_fd_instantiate(struct inode *dir,
+	struct dentry *dentry, struct task_struct *task, void *ptr)
 {
-	struct task_struct *task = get_proc_task(dir);
-	unsigned fd = name_to_int(dentry);
-	struct dentry *result = ERR_PTR(-ENOENT);
-	struct file * file;
-	struct files_struct * files;
-	struct inode *inode;
-	struct proc_inode *ei;
-
-	if (!task)
-		goto out_no_task;
-	if (fd == ~0U)
-		goto out;
+	unsigned fd = *(unsigned *)ptr;
+	struct file *file;
+	struct files_struct *files;
+ 	struct inode *inode;
+ 	struct proc_inode *ei;
+	struct dentry *error = ERR_PTR(-ENOENT);
 
-	inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
+	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
 		goto out;
 	ei = PROC_I(inode);
 	ei->fd = fd;
 	files = get_files_struct(task);
 	if (!files)
-		goto out_unlock;
+		goto out_iput;
 	inode->i_mode = S_IFLNK;
 
 	/*
@@ -1505,13 +1234,14 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	spin_lock(&files->file_lock);
 	file = fcheck_files(files, fd);
 	if (!file)
-		goto out_unlock2;
+		goto out_unlock;
 	if (file->f_mode & 1)
 		inode->i_mode |= S_IRUSR | S_IXUSR;
 	if (file->f_mode & 2)
 		inode->i_mode |= S_IWUSR | S_IXUSR;
 	spin_unlock(&files->file_lock);
 	put_files_struct(files);
+
 	inode->i_op = &proc_pid_link_inode_operations;
 	inode->i_size = 64;
 	ei->op.proc_get_link = proc_fd_link;
@@ -1519,34 +1249,106 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
 	if (tid_fd_revalidate(dentry, NULL))
-		result = NULL;
-out:
-	put_task_struct(task);
-out_no_task:
-	return result;
+		error = NULL;
 
-out_unlock2:
+ out:
+	return error;
+out_unlock:
 	spin_unlock(&files->file_lock);
 	put_files_struct(files);
-out_unlock:
+out_iput:
 	iput(inode);
 	goto out;
 }
 
-static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir);
-static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd);
-static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
+static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+{
+	struct task_struct *task = get_proc_task(dir);
+	unsigned fd = name_to_int(dentry);
+	struct dentry *result = ERR_PTR(-ENOENT);
+
+	if (!task)
+		goto out_no_task;
+	if (fd == ~0U)
+		goto out;
+
+	result = proc_fd_instantiate(dir, dentry, task, &fd);
+out:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_fd_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+	struct task_struct *task, int fd)
+{
+	char name[PROC_NUMBUF];
+	int len = snprintf(name, sizeof(name), "%d", fd);
+	return proc_fill_cache(filp, dirent, filldir, name, len,
+				proc_fd_instantiate, task, &fd);
+}
+
+static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *p = get_proc_task(inode);
+	unsigned int fd, tid, ino;
+	int retval;
+	struct files_struct * files;
+	struct fdtable *fdt;
+
+	retval = -ENOENT;
+	if (!p)
+		goto out_no_task;
+	retval = 0;
+	tid = p->pid;
+
+	fd = filp->f_pos;
+	switch (fd) {
+		case 0:
+			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+				goto out;
+			filp->f_pos++;
+		case 1:
+			ino = parent_ino(dentry);
+			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+				goto out;
+			filp->f_pos++;
+		default:
+			files = get_files_struct(p);
+			if (!files)
+				goto out;
+			rcu_read_lock();
+			fdt = files_fdtable(files);
+			for (fd = filp->f_pos-2;
+			     fd < fdt->max_fds;
+			     fd++, filp->f_pos++) {
+
+				if (!fcheck_files(files, fd))
+					continue;
+				rcu_read_unlock();
+
+				if (proc_fd_fill_cache(filp, dirent, filldir, p, fd) < 0) {
+					rcu_read_lock();
+					break;
+				}
+				rcu_read_lock();
+			}
+			rcu_read_unlock();
+			put_files_struct(files);
+	}
+out:
+	put_task_struct(p);
+out_no_task:
+	return retval;
+}
 
 static struct file_operations proc_fd_operations = {
 	.read		= generic_read_dir,
 	.readdir	= proc_readfd,
 };
 
-static struct file_operations proc_task_operations = {
-	.read		= generic_read_dir,
-	.readdir	= proc_task_readdir,
-};
-
 /*
  * proc directories can do almost nothing..
  */
@@ -1555,11 +1357,137 @@ static struct inode_operations proc_fd_inode_operations = {
 	.setattr	= proc_setattr,
 };
 
-static struct inode_operations proc_task_inode_operations = {
-	.lookup		= proc_task_lookup,
-	.getattr	= proc_task_getattr,
-	.setattr	= proc_setattr,
-};
+static struct dentry *proc_pident_instantiate(struct inode *dir,
+	struct dentry *dentry, struct task_struct *task, void *ptr)
+{
+	struct pid_entry *p = ptr;
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct dentry *error = ERR_PTR(-EINVAL);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	ei = PROC_I(inode);
+	inode->i_mode = p->mode;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink = 2;	/* Use getattr to fix if necessary */
+	if (p->iop)
+		inode->i_op = p->iop;
+	if (p->fop)
+		inode->i_fop = p->fop;
+	ei->op = p->op;
+	dentry->d_op = &pid_dentry_operations;
+	d_add(dentry, inode);
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, NULL))
+		error = NULL;
+out:
+	return error;
+}
+
+static struct dentry *proc_pident_lookup(struct inode *dir, 
+					 struct dentry *dentry,
+					 struct pid_entry *ents,
+					 unsigned int nents)
+{
+	struct inode *inode;
+	struct dentry *error;
+	struct task_struct *task = get_proc_task(dir);
+	struct pid_entry *p, *last;
+
+	error = ERR_PTR(-ENOENT);
+	inode = NULL;
+
+	if (!task)
+		goto out_no_task;
+
+	/*
+	 * Yes, it does not scale. And it should not. Don't add
+	 * new entries into /proc/<tgid>/ without very good reasons.
+	 */
+	last = &ents[nents - 1];
+	for (p = ents; p <= last; p++) {
+		if (p->len != dentry->d_name.len)
+			continue;
+		if (!memcmp(dentry->d_name.name, p->name, p->len))
+			break;
+	}
+	if (p > last)
+		goto out;
+
+	error = proc_pident_instantiate(dir, dentry, task, p);
+out:
+	put_task_struct(task);
+out_no_task:
+	return error;
+}
+
+static int proc_pident_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+	struct task_struct *task, struct pid_entry *p)
+{
+	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
+				proc_pident_instantiate, task, p);
+}
+
+static int proc_pident_readdir(struct file *filp,
+		void *dirent, filldir_t filldir,
+		struct pid_entry *ents, unsigned int nents)
+{
+	int i;
+	int pid;
+	struct dentry *dentry = filp->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task = get_proc_task(inode);
+	struct pid_entry *p, *last;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	if (!task)
+		goto out_no_task;
+
+	ret = 0;
+	pid = task->pid;
+	i = filp->f_pos;
+	switch (i) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+			goto out;
+		i++;
+		filp->f_pos++;
+		/* fall through */
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+			goto out;
+		i++;
+		filp->f_pos++;
+		/* fall through */
+	default:
+		i -= 2;
+		if (i >= nents) {
+			ret = 1;
+			goto out;
+		}
+		p = ents + i;
+		last = &ents[nents - 1];
+		while (p <= last) {
+			if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
+				goto out;
+			filp->f_pos++;
+			p++;
+		}
+	}
+
+	ret = 1;
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
 
 #ifdef CONFIG_SECURITY
 static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
@@ -1580,8 +1508,8 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
 	if (!(page = __get_free_page(GFP_KERNEL)))
 		goto out;
 
-	length = security_getprocattr(task, 
-				      (char*)file->f_dentry->d_name.name, 
+	length = security_getprocattr(task,
+				      (char*)file->f_dentry->d_name.name,
 				      (void*)page, count);
 	if (length >= 0)
 		length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
@@ -1594,17 +1522,17 @@ out_no_task:
 
 static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 				   size_t count, loff_t *ppos)
-{ 
+{
 	struct inode * inode = file->f_dentry->d_inode;
-	char *page; 
-	ssize_t length; 
+	char *page;
+	ssize_t length;
 	struct task_struct *task = get_proc_task(inode);
 
 	length = -ESRCH;
 	if (!task)
 		goto out_no_task;
-	if (count > PAGE_SIZE) 
-		count = PAGE_SIZE; 
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
 
 	/* No partial writes. */
 	length = -EINVAL;
@@ -1612,16 +1540,16 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 		goto out;
 
 	length = -ENOMEM;
-	page = (char*)__get_free_page(GFP_USER); 
-	if (!page) 
+	page = (char*)__get_free_page(GFP_USER);
+	if (!page)
 		goto out;
 
-	length = -EFAULT; 
-	if (copy_from_user(page, buf, count)) 
+	length = -EFAULT;
+	if (copy_from_user(page, buf, count))
 		goto out_free;
 
-	length = security_setprocattr(task, 
-				      (char*)file->f_dentry->d_name.name, 
+	length = security_setprocattr(task,
+				      (char*)file->f_dentry->d_name.name,
 				      (void*)page, count);
 out_free:
 	free_page((unsigned long) page);
@@ -1629,330 +1557,263 @@ out:
 	put_task_struct(task);
 out_no_task:
 	return length;
-} 
+}
 
 static struct file_operations proc_pid_attr_operations = {
 	.read		= proc_pid_attr_read,
 	.write		= proc_pid_attr_write,
 };
 
-static struct file_operations proc_tid_attr_operations;
-static struct inode_operations proc_tid_attr_inode_operations;
-static struct file_operations proc_tgid_attr_operations;
-static struct inode_operations proc_tgid_attr_inode_operations;
+static struct pid_entry attr_dir_stuff[] = {
+	REG("current",    S_IRUGO|S_IWUGO, pid_attr),
+	REG("prev",       S_IRUGO,	   pid_attr),
+	REG("exec",       S_IRUGO|S_IWUGO, pid_attr),
+	REG("fscreate",   S_IRUGO|S_IWUGO, pid_attr),
+	REG("keycreate",  S_IRUGO|S_IWUGO, pid_attr),
+	REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr),
+};
+
+static int proc_attr_dir_readdir(struct file * filp,
+			     void * dirent, filldir_t filldir)
+{
+	return proc_pident_readdir(filp,dirent,filldir,
+				   attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
+}
+
+static struct file_operations proc_attr_dir_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_attr_dir_readdir,
+};
+
+static struct dentry *proc_attr_dir_lookup(struct inode *dir,
+				struct dentry *dentry, struct nameidata *nd)
+{
+	return proc_pident_lookup(dir, dentry,
+				  attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
+}
+
+static struct inode_operations proc_attr_dir_inode_operations = {
+	.lookup		= proc_attr_dir_lookup,
+	.getattr	= pid_getattr,
+	.setattr	= proc_setattr,
+};
+
 #endif
 
-/* SMP-safe */
-static struct dentry *proc_pident_lookup(struct inode *dir, 
-					 struct dentry *dentry,
-					 struct pid_entry *ents)
+/*
+ * /proc/self:
+ */
+static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
+			      int buflen)
+{
+	char tmp[PROC_NUMBUF];
+	sprintf(tmp, "%d", current->tgid);
+	return vfs_readlink(dentry,buffer,buflen,tmp);
+}
+
+static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
+	char tmp[PROC_NUMBUF];
+	sprintf(tmp, "%d", current->tgid);
+	return ERR_PTR(vfs_follow_link(nd,tmp));
+}
+
+static struct inode_operations proc_self_inode_operations = {
+	.readlink	= proc_self_readlink,
+	.follow_link	= proc_self_follow_link,
+};
+
+/*
+ * proc base
+ *
+ * These are the directory entries in the root directory of /proc
+ * that properly belong to the /proc filesystem, as they describe
+ * describe something that is process related.
+ */
+static struct pid_entry proc_base_stuff[] = {
+	NOD("self", S_IFLNK|S_IRWXUGO,
+		&proc_self_inode_operations, NULL, {}),
+};
+
+/*
+ *	Exceptional case: normally we are not allowed to unhash a busy
+ * directory. In this case, however, we can do it - no aliasing problems
+ * due to the way we treat inodes.
+ */
+static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task = get_proc_task(inode);
+	if (task) {
+		put_task_struct(task);
+		return 1;
+	}
+	d_drop(dentry);
+	return 0;
+}
+
+static struct dentry_operations proc_base_dentry_operations =
+{
+	.d_revalidate	= proc_base_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static struct dentry *proc_base_instantiate(struct inode *dir,
+	struct dentry *dentry, struct task_struct *task, void *ptr)
+{
+	struct pid_entry *p = ptr;
 	struct inode *inode;
+	struct proc_inode *ei;
+	struct dentry *error = ERR_PTR(-EINVAL);
+
+	/* Allocate the inode */
+	error = ERR_PTR(-ENOMEM);
+	inode = new_inode(dir->i_sb);
+	if (!inode)
+		goto out;
+
+	/* Initialize the inode */
+	ei = PROC_I(inode);
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	/*
+	 * grab the reference to the task.
+	 */
+	ei->pid = get_task_pid(task, PIDTYPE_PID);
+	if (!ei->pid)
+		goto out_iput;
+
+	inode->i_uid = 0;
+	inode->i_gid = 0;
+	inode->i_mode = p->mode;
+	if (S_ISDIR(inode->i_mode))
+		inode->i_nlink = 2;
+	if (S_ISLNK(inode->i_mode))
+		inode->i_size = 64;
+	if (p->iop)
+		inode->i_op = p->iop;
+	if (p->fop)
+		inode->i_fop = p->fop;
+	ei->op = p->op;
+	dentry->d_op = &proc_base_dentry_operations;
+	d_add(dentry, inode);
+	error = NULL;
+out:
+	return error;
+out_iput:
+	iput(inode);
+	goto out;
+}
+
+static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
+{
 	struct dentry *error;
 	struct task_struct *task = get_proc_task(dir);
-	struct pid_entry *p;
-	struct proc_inode *ei;
+	struct pid_entry *p, *last;
 
 	error = ERR_PTR(-ENOENT);
-	inode = NULL;
 
 	if (!task)
 		goto out_no_task;
 
-	for (p = ents; p->name; p++) {
+	/* Lookup the directory entry */
+	last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
+	for (p = proc_base_stuff; p <= last; p++) {
 		if (p->len != dentry->d_name.len)
 			continue;
 		if (!memcmp(dentry->d_name.name, p->name, p->len))
 			break;
 	}
-	if (!p->name)
+	if (p > last)
 		goto out;
 
-	error = ERR_PTR(-EINVAL);
-	inode = proc_pid_make_inode(dir->i_sb, task, p->type);
-	if (!inode)
-		goto out;
+	error = proc_base_instantiate(dir, dentry, task, p);
 
-	ei = PROC_I(inode);
-	inode->i_mode = p->mode;
-	/*
-	 * Yes, it does not scale. And it should not. Don't add
-	 * new entries into /proc/<tgid>/ without very good reasons.
-	 */
-	switch(p->type) {
-		case PROC_TGID_TASK:
-			inode->i_nlink = 2;
-			inode->i_op = &proc_task_inode_operations;
-			inode->i_fop = &proc_task_operations;
-			break;
-		case PROC_TID_FD:
-		case PROC_TGID_FD:
-			inode->i_nlink = 2;
-			inode->i_op = &proc_fd_inode_operations;
-			inode->i_fop = &proc_fd_operations;
-			break;
-		case PROC_TID_EXE:
-		case PROC_TGID_EXE:
-			inode->i_op = &proc_pid_link_inode_operations;
-			ei->op.proc_get_link = proc_exe_link;
-			break;
-		case PROC_TID_CWD:
-		case PROC_TGID_CWD:
-			inode->i_op = &proc_pid_link_inode_operations;
-			ei->op.proc_get_link = proc_cwd_link;
-			break;
-		case PROC_TID_ROOT:
-		case PROC_TGID_ROOT:
-			inode->i_op = &proc_pid_link_inode_operations;
-			ei->op.proc_get_link = proc_root_link;
-			break;
-		case PROC_TID_ENVIRON:
-		case PROC_TGID_ENVIRON:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_pid_environ;
-			break;
-		case PROC_TID_AUXV:
-		case PROC_TGID_AUXV:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_pid_auxv;
-			break;
-		case PROC_TID_STATUS:
-		case PROC_TGID_STATUS:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_pid_status;
-			break;
-		case PROC_TID_STAT:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_tid_stat;
-			break;
-		case PROC_TGID_STAT:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_tgid_stat;
-			break;
-		case PROC_TID_CMDLINE:
-		case PROC_TGID_CMDLINE:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_pid_cmdline;
-			break;
-		case PROC_TID_STATM:
-		case PROC_TGID_STATM:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_pid_statm;
-			break;
-		case PROC_TID_MAPS:
-		case PROC_TGID_MAPS:
-			inode->i_fop = &proc_maps_operations;
-			break;
+out:
+	put_task_struct(task);
+out_no_task:
+	return error;
+}
+
+static int proc_base_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+	struct task_struct *task, struct pid_entry *p)
+{
+	return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
+				proc_base_instantiate, task, p);
+}
+
+/*
+ * Thread groups
+ */
+static struct file_operations proc_task_operations;
+static struct inode_operations proc_task_inode_operations;
+
+static struct pid_entry tgid_base_stuff[] = {
+	DIR("task",       S_IRUGO|S_IXUGO, task),
+	DIR("fd",         S_IRUSR|S_IXUSR, fd),
+	INF("environ",    S_IRUSR, pid_environ),
+	INF("auxv",       S_IRUSR, pid_auxv),
+	INF("status",     S_IRUGO, pid_status),
+	INF("cmdline",    S_IRUGO, pid_cmdline),
+	INF("stat",       S_IRUGO, tgid_stat),
+	INF("statm",      S_IRUGO, pid_statm),
+	REG("maps",       S_IRUGO, maps),
 #ifdef CONFIG_NUMA
-		case PROC_TID_NUMA_MAPS:
-		case PROC_TGID_NUMA_MAPS:
-			inode->i_fop = &proc_numa_maps_operations;
-			break;
+	REG("numa_maps",  S_IRUGO, numa_maps),
 #endif
-		case PROC_TID_MEM:
-		case PROC_TGID_MEM:
-			inode->i_fop = &proc_mem_operations;
-			break;
+	REG("mem",        S_IRUSR|S_IWUSR, mem),
 #ifdef CONFIG_SECCOMP
-		case PROC_TID_SECCOMP:
-		case PROC_TGID_SECCOMP:
-			inode->i_fop = &proc_seccomp_operations;
-			break;
-#endif /* CONFIG_SECCOMP */
-		case PROC_TID_MOUNTS:
-		case PROC_TGID_MOUNTS:
-			inode->i_fop = &proc_mounts_operations;
-			break;
+	REG("seccomp",    S_IRUSR|S_IWUSR, seccomp),
+#endif
+	LNK("cwd",        cwd),
+	LNK("root",       root),
+	LNK("exe",        exe),
+	REG("mounts",     S_IRUGO, mounts),
+	REG("mountstats", S_IRUSR, mountstats),
 #ifdef CONFIG_MMU
-		case PROC_TID_SMAPS:
-		case PROC_TGID_SMAPS:
-			inode->i_fop = &proc_smaps_operations;
-			break;
+	REG("smaps",      S_IRUGO, smaps),
 #endif
-		case PROC_TID_MOUNTSTATS:
-		case PROC_TGID_MOUNTSTATS:
-			inode->i_fop = &proc_mountstats_operations;
-			break;
 #ifdef CONFIG_SECURITY
-		case PROC_TID_ATTR:
-			inode->i_nlink = 2;
-			inode->i_op = &proc_tid_attr_inode_operations;
-			inode->i_fop = &proc_tid_attr_operations;
-			break;
-		case PROC_TGID_ATTR:
-			inode->i_nlink = 2;
-			inode->i_op = &proc_tgid_attr_inode_operations;
-			inode->i_fop = &proc_tgid_attr_operations;
-			break;
-		case PROC_TID_ATTR_CURRENT:
-		case PROC_TGID_ATTR_CURRENT:
-		case PROC_TID_ATTR_PREV:
-		case PROC_TGID_ATTR_PREV:
-		case PROC_TID_ATTR_EXEC:
-		case PROC_TGID_ATTR_EXEC:
-		case PROC_TID_ATTR_FSCREATE:
-		case PROC_TGID_ATTR_FSCREATE:
-		case PROC_TID_ATTR_KEYCREATE:
-		case PROC_TGID_ATTR_KEYCREATE:
-		case PROC_TID_ATTR_SOCKCREATE:
-		case PROC_TGID_ATTR_SOCKCREATE:
-			inode->i_fop = &proc_pid_attr_operations;
-			break;
+	DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
 #endif
 #ifdef CONFIG_KALLSYMS
-		case PROC_TID_WCHAN:
-		case PROC_TGID_WCHAN:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_pid_wchan;
-			break;
+	INF("wchan",      S_IRUGO, pid_wchan),
 #endif
 #ifdef CONFIG_SCHEDSTATS
-		case PROC_TID_SCHEDSTAT:
-		case PROC_TGID_SCHEDSTAT:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_pid_schedstat;
-			break;
+	INF("schedstat",  S_IRUGO, pid_schedstat),
 #endif
 #ifdef CONFIG_CPUSETS
-		case PROC_TID_CPUSET:
-		case PROC_TGID_CPUSET:
-			inode->i_fop = &proc_cpuset_operations;
-			break;
+	REG("cpuset",     S_IRUGO, cpuset),
 #endif
-		case PROC_TID_OOM_SCORE:
-		case PROC_TGID_OOM_SCORE:
-			inode->i_fop = &proc_info_file_operations;
-			ei->op.proc_read = proc_oom_score;
-			break;
-		case PROC_TID_OOM_ADJUST:
-		case PROC_TGID_OOM_ADJUST:
-			inode->i_fop = &proc_oom_adjust_operations;
-			break;
+	INF("oom_score",  S_IRUGO, oom_score),
+	REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
-		case PROC_TID_LOGINUID:
-		case PROC_TGID_LOGINUID:
-			inode->i_fop = &proc_loginuid_operations;
-			break;
+	REG("loginuid",   S_IWUSR|S_IRUGO, loginuid),
 #endif
-		default:
-			printk("procfs: impossible type (%d)",p->type);
-			iput(inode);
-			error = ERR_PTR(-EINVAL);
-			goto out;
-	}
-	dentry->d_op = &pid_dentry_operations;
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, NULL))
-		error = NULL;
-out:
-	put_task_struct(task);
-out_no_task:
-	return error;
-}
-
-static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
-	return proc_pident_lookup(dir, dentry, tgid_base_stuff);
-}
-
-static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
-	return proc_pident_lookup(dir, dentry, tid_base_stuff);
-}
-
-static struct file_operations proc_tgid_base_operations = {
-	.read		= generic_read_dir,
-	.readdir	= proc_tgid_base_readdir,
 };
 
-static struct file_operations proc_tid_base_operations = {
-	.read		= generic_read_dir,
-	.readdir	= proc_tid_base_readdir,
-};
-
-static struct inode_operations proc_tgid_base_inode_operations = {
-	.lookup		= proc_tgid_base_lookup,
-	.getattr	= pid_getattr,
-	.setattr	= proc_setattr,
-};
-
-static struct inode_operations proc_tid_base_inode_operations = {
-	.lookup		= proc_tid_base_lookup,
-	.getattr	= pid_getattr,
-	.setattr	= proc_setattr,
-};
-
-#ifdef CONFIG_SECURITY
-static int proc_tgid_attr_readdir(struct file * filp,
-			     void * dirent, filldir_t filldir)
-{
-	return proc_pident_readdir(filp,dirent,filldir,
-				   tgid_attr_stuff,ARRAY_SIZE(tgid_attr_stuff));
-}
-
-static int proc_tid_attr_readdir(struct file * filp,
+static int proc_tgid_base_readdir(struct file * filp,
 			     void * dirent, filldir_t filldir)
 {
 	return proc_pident_readdir(filp,dirent,filldir,
-				   tid_attr_stuff,ARRAY_SIZE(tid_attr_stuff));
+				   tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
 }
 
-static struct file_operations proc_tgid_attr_operations = {
-	.read		= generic_read_dir,
-	.readdir	= proc_tgid_attr_readdir,
-};
-
-static struct file_operations proc_tid_attr_operations = {
+static struct file_operations proc_tgid_base_operations = {
 	.read		= generic_read_dir,
-	.readdir	= proc_tid_attr_readdir,
+	.readdir	= proc_tgid_base_readdir,
 };
 
-static struct dentry *proc_tgid_attr_lookup(struct inode *dir,
-				struct dentry *dentry, struct nameidata *nd)
-{
-	return proc_pident_lookup(dir, dentry, tgid_attr_stuff);
-}
-
-static struct dentry *proc_tid_attr_lookup(struct inode *dir,
-				struct dentry *dentry, struct nameidata *nd)
-{
-	return proc_pident_lookup(dir, dentry, tid_attr_stuff);
+static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
+	return proc_pident_lookup(dir, dentry,
+				  tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
 }
 
-static struct inode_operations proc_tgid_attr_inode_operations = {
-	.lookup		= proc_tgid_attr_lookup,
-	.getattr	= pid_getattr,
-	.setattr	= proc_setattr,
-};
-
-static struct inode_operations proc_tid_attr_inode_operations = {
-	.lookup		= proc_tid_attr_lookup,
+static struct inode_operations proc_tgid_base_inode_operations = {
+	.lookup		= proc_tgid_base_lookup,
 	.getattr	= pid_getattr,
 	.setattr	= proc_setattr,
 };
-#endif
-
-/*
- * /proc/self:
- */
-static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
-			      int buflen)
-{
-	char tmp[PROC_NUMBUF];
-	sprintf(tmp, "%d", current->tgid);
-	return vfs_readlink(dentry,buffer,buflen,tmp);
-}
-
-static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	char tmp[PROC_NUMBUF];
-	sprintf(tmp, "%d", current->tgid);
-	return ERR_PTR(vfs_follow_link(nd,tmp));
-}	
-
-static struct inode_operations proc_self_inode_operations = {
-	.readlink	= proc_self_readlink,
-	.follow_link	= proc_self_follow_link,
-};
 
 /**
  * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
@@ -2021,54 +1882,23 @@ out:
 	return;
 }
 
-/* SMP-safe */
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+struct dentry *proc_pid_instantiate(struct inode *dir,
+	struct dentry * dentry, struct task_struct *task, void *ptr)
 {
-	struct dentry *result = ERR_PTR(-ENOENT);
-	struct task_struct *task;
+	struct dentry *error = ERR_PTR(-ENOENT);
 	struct inode *inode;
-	struct proc_inode *ei;
-	unsigned tgid;
-
-	if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) {
-		inode = new_inode(dir->i_sb);
-		if (!inode)
-			return ERR_PTR(-ENOMEM);
-		ei = PROC_I(inode);
-		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-		inode->i_ino = fake_ino(0, PROC_TGID_INO);
-		ei->pde = NULL;
-		inode->i_mode = S_IFLNK|S_IRWXUGO;
-		inode->i_uid = inode->i_gid = 0;
-		inode->i_size = 64;
-		inode->i_op = &proc_self_inode_operations;
-		d_add(dentry, inode);
-		return NULL;
-	}
-	tgid = name_to_int(dentry);
-	if (tgid == ~0U)
-		goto out;
 
-	rcu_read_lock();
-	task = find_task_by_pid(tgid);
-	if (task)
-		get_task_struct(task);
-	rcu_read_unlock();
-	if (!task)
-		goto out;
-
-	inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO);
+	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
-		goto out_put_task;
+		goto out;
 
 	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
 	inode->i_op = &proc_tgid_base_inode_operations;
 	inode->i_fop = &proc_tgid_base_operations;
 	inode->i_flags|=S_IMMUTABLE;
-#ifdef CONFIG_SECURITY
-	inode->i_nlink = 5;
-#else
 	inode->i_nlink = 4;
+#ifdef CONFIG_SECURITY
+	inode->i_nlink += 1;
 #endif
 
 	dentry->d_op = &pid_dentry_operations;
@@ -2076,179 +1906,251 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
 	if (pid_revalidate(dentry, NULL))
-		result = NULL;
-
-out_put_task:
-	put_task_struct(task);
+		error = NULL;
 out:
-	return result;
+	return error;
 }
 
-/* SMP-safe */
-static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
 	struct dentry *result = ERR_PTR(-ENOENT);
 	struct task_struct *task;
-	struct task_struct *leader = get_proc_task(dir);
-	struct inode *inode;
-	unsigned tid;
+	unsigned tgid;
 
-	if (!leader)
-		goto out_no_task;
+	result = proc_base_lookup(dir, dentry);
+	if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
+		goto out;
 
-	tid = name_to_int(dentry);
-	if (tid == ~0U)
+	tgid = name_to_int(dentry);
+	if (tgid == ~0U)
 		goto out;
 
 	rcu_read_lock();
-	task = find_task_by_pid(tid);
+	task = find_task_by_pid(tgid);
 	if (task)
 		get_task_struct(task);
 	rcu_read_unlock();
 	if (!task)
 		goto out;
-	if (leader->tgid != task->tgid)
-		goto out_drop_task;
-
-	inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_INO);
-
-
-	if (!inode)
-		goto out_drop_task;
-	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
-	inode->i_op = &proc_tid_base_inode_operations;
-	inode->i_fop = &proc_tid_base_operations;
-	inode->i_flags|=S_IMMUTABLE;
-#ifdef CONFIG_SECURITY
-	inode->i_nlink = 4;
-#else
-	inode->i_nlink = 3;
-#endif
-
-	dentry->d_op = &pid_dentry_operations;
-
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (pid_revalidate(dentry, NULL))
-		result = NULL;
 
-out_drop_task:
+	result = proc_pid_instantiate(dir, dentry, task, NULL);
 	put_task_struct(task);
 out:
-	put_task_struct(leader);
-out_no_task:
 	return result;
 }
 
 /*
- * Find the first tgid to return to user space.
- *
- * Usually this is just whatever follows &init_task, but if the users
- * buffer was too small to hold the full list or there was a seek into
- * the middle of the directory we have more work to do.
- *
- * In the case of a short read we start with find_task_by_pid.
+ * Find the first task with tgid >= tgid
  *
- * In the case of a seek we start with &init_task and walk nr
- * threads past it.
  */
-static struct task_struct *first_tgid(int tgid, unsigned int nr)
+static struct task_struct *next_tgid(unsigned int tgid)
 {
-	struct task_struct *pos;
-	rcu_read_lock();
-	if (tgid && nr) {
-		pos = find_task_by_pid(tgid);
-		if (pos && thread_group_leader(pos))
-			goto found;
-	}
-	/* If nr exceeds the number of processes get out quickly */
-	pos = NULL;
-	if (nr && nr >= nr_processes())
-		goto done;
+	struct task_struct *task;
+	struct pid *pid;
 
-	/* If we haven't found our starting place yet start with
-	 * the init_task and walk nr tasks forward.
-	 */
-	for (pos = next_task(&init_task); nr > 0; --nr) {
-		pos = next_task(pos);
-		if (pos == &init_task) {
-			pos = NULL;
-			goto done;
-		}
+	rcu_read_lock();
+retry:
+	task = NULL;
+	pid = find_ge_pid(tgid);
+	if (pid) {
+		tgid = pid->nr + 1;
+		task = pid_task(pid, PIDTYPE_PID);
+		/* What we to know is if the pid we have find is the
+		 * pid of a thread_group_leader.  Testing for task
+		 * being a thread_group_leader is the obvious thing
+		 * todo but there is a window when it fails, due to
+		 * the pid transfer logic in de_thread.
+		 *
+		 * So we perform the straight forward test of seeing
+		 * if the pid we have found is the pid of a thread
+		 * group leader, and don't worry if the task we have
+		 * found doesn't happen to be a thread group leader.
+		 * As we don't care in the case of readdir.
+		 */
+		if (!task || !has_group_leader_pid(task))
+			goto retry;
+		get_task_struct(task);
 	}
-found:
-	get_task_struct(pos);
-done:
 	rcu_read_unlock();
-	return pos;
+	return task;
 }
 
-/*
- * Find the next task in the task list.
- * Return NULL if we loop or there is any error.
- *
- * The reference to the input task_struct is released.
- */
-static struct task_struct *next_tgid(struct task_struct *start)
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
+
+static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+	struct task_struct *task, int tgid)
 {
-	struct task_struct *pos;
-	rcu_read_lock();
-	pos = start;
-	if (pid_alive(start))
-		pos = next_task(start);
-	if (pid_alive(pos) && (pos != &init_task)) {
-		get_task_struct(pos);
-		goto done;
-	}
-	pos = NULL;
-done:
-	rcu_read_unlock();
-	put_task_struct(start);
-	return pos;
+	char name[PROC_NUMBUF];
+	int len = snprintf(name, sizeof(name), "%d", tgid);
+	return proc_fill_cache(filp, dirent, filldir, name, len,
+				proc_pid_instantiate, task, NULL);
 }
 
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-	char buf[PROC_NUMBUF];
 	unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
+	struct task_struct *reaper = get_proc_task(filp->f_dentry->d_inode);
 	struct task_struct *task;
 	int tgid;
 
-	if (!nr) {
-		ino_t ino = fake_ino(0,PROC_TGID_INO);
-		if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0)
-			return 0;
-		filp->f_pos++;
-		nr++;
+	if (!reaper)
+		goto out_no_task;
+
+	for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
+		struct pid_entry *p = &proc_base_stuff[nr];
+		if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
+			goto out;
 	}
-	nr -= 1;
 
-	/* f_version caches the tgid value that the last readdir call couldn't
-	 * return. lseek aka telldir automagically resets f_version to 0.
-	 */
-	tgid = filp->f_version;
-	filp->f_version = 0;
-	for (task = first_tgid(tgid, nr);
+	tgid = filp->f_pos - TGID_OFFSET;
+	for (task = next_tgid(tgid);
 	     task;
-	     task = next_tgid(task), filp->f_pos++) {
-		int len;
-		ino_t ino;
+	     put_task_struct(task), task = next_tgid(tgid + 1)) {
 		tgid = task->pid;
-		len = snprintf(buf, sizeof(buf), "%d", tgid);
-		ino = fake_ino(tgid, PROC_TGID_INO);
-		if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) {
-			/* returning this tgid failed, save it as the first
-			 * pid for the next readir call */
-			filp->f_version = tgid;
+		filp->f_pos = tgid + TGID_OFFSET;
+		if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) {
 			put_task_struct(task);
-			break;
+			goto out;
 		}
 	}
+	filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
+out:
+	put_task_struct(reaper);
+out_no_task:
 	return 0;
 }
 
 /*
+ * Tasks
+ */
+static struct pid_entry tid_base_stuff[] = {
+	DIR("fd",        S_IRUSR|S_IXUSR, fd),
+	INF("environ",   S_IRUSR, pid_environ),
+	INF("auxv",      S_IRUSR, pid_auxv),
+	INF("status",    S_IRUGO, pid_status),
+	INF("cmdline",   S_IRUGO, pid_cmdline),
+	INF("stat",      S_IRUGO, tid_stat),
+	INF("statm",     S_IRUGO, pid_statm),
+	REG("maps",      S_IRUGO, maps),
+#ifdef CONFIG_NUMA
+	REG("numa_maps", S_IRUGO, numa_maps),
+#endif
+	REG("mem",       S_IRUSR|S_IWUSR, mem),
+#ifdef CONFIG_SECCOMP
+	REG("seccomp",   S_IRUSR|S_IWUSR, seccomp),
+#endif
+	LNK("cwd",       cwd),
+	LNK("root",      root),
+	LNK("exe",       exe),
+	REG("mounts",    S_IRUGO, mounts),
+#ifdef CONFIG_MMU
+	REG("smaps",     S_IRUGO, smaps),
+#endif
+#ifdef CONFIG_SECURITY
+	DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
+#endif
+#ifdef CONFIG_KALLSYMS
+	INF("wchan",     S_IRUGO, pid_wchan),
+#endif
+#ifdef CONFIG_SCHEDSTATS
+	INF("schedstat", S_IRUGO, pid_schedstat),
+#endif
+#ifdef CONFIG_CPUSETS
+	REG("cpuset",    S_IRUGO, cpuset),
+#endif
+	INF("oom_score", S_IRUGO, oom_score),
+	REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
+#ifdef CONFIG_AUDITSYSCALL
+	REG("loginuid",  S_IWUSR|S_IRUGO, loginuid),
+#endif
+};
+
+static int proc_tid_base_readdir(struct file * filp,
+			     void * dirent, filldir_t filldir)
+{
+	return proc_pident_readdir(filp,dirent,filldir,
+				   tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
+}
+
+static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
+	return proc_pident_lookup(dir, dentry,
+				  tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
+}
+
+static struct file_operations proc_tid_base_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_tid_base_readdir,
+};
+
+static struct inode_operations proc_tid_base_inode_operations = {
+	.lookup		= proc_tid_base_lookup,
+	.getattr	= pid_getattr,
+	.setattr	= proc_setattr,
+};
+
+static struct dentry *proc_task_instantiate(struct inode *dir,
+	struct dentry *dentry, struct task_struct *task, void *ptr)
+{
+	struct dentry *error = ERR_PTR(-ENOENT);
+	struct inode *inode;
+	inode = proc_pid_make_inode(dir->i_sb, task);
+
+	if (!inode)
+		goto out;
+	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
+	inode->i_op = &proc_tid_base_inode_operations;
+	inode->i_fop = &proc_tid_base_operations;
+	inode->i_flags|=S_IMMUTABLE;
+	inode->i_nlink = 3;
+#ifdef CONFIG_SECURITY
+	inode->i_nlink += 1;
+#endif
+
+	dentry->d_op = &pid_dentry_operations;
+
+	d_add(dentry, inode);
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, NULL))
+		error = NULL;
+out:
+	return error;
+}
+
+static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
+{
+	struct dentry *result = ERR_PTR(-ENOENT);
+	struct task_struct *task;
+	struct task_struct *leader = get_proc_task(dir);
+	unsigned tid;
+
+	if (!leader)
+		goto out_no_task;
+
+	tid = name_to_int(dentry);
+	if (tid == ~0U)
+		goto out;
+
+	rcu_read_lock();
+	task = find_task_by_pid(tid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+	if (!task)
+		goto out;
+	if (leader->tgid != task->tgid)
+		goto out_drop_task;
+
+	result = proc_task_instantiate(dir, dentry, task, NULL);
+out_drop_task:
+	put_task_struct(task);
+out:
+	put_task_struct(leader);
+out_no_task:
+	return result;
+}
+
+/*
  * Find the first tid of a thread group to return to user space.
  *
  * Usually this is just the thread group leader, but if the users
@@ -2317,10 +2219,18 @@ static struct task_struct *next_tid(struct task_struct *start)
 	return pos;
 }
 
+static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+	struct task_struct *task, int tid)
+{
+	char name[PROC_NUMBUF];
+	int len = snprintf(name, sizeof(name), "%d", tid);
+	return proc_fill_cache(filp, dirent, filldir, name, len,
+				proc_task_instantiate, task, NULL);
+}
+
 /* for the /proc/TGID/task/ directories */
 static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
-	char buf[PROC_NUMBUF];
 	struct dentry *dentry = filp->f_dentry;
 	struct inode *inode = dentry->d_inode;
 	struct task_struct *leader = get_proc_task(inode);
@@ -2357,11 +2267,8 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	for (task = first_tid(leader, tid, pos - 2);
 	     task;
 	     task = next_tid(task), pos++) {
-		int len;
 		tid = task->pid;
-		len = snprintf(buf, sizeof(buf), "%d", tid);
-		ino = fake_ino(tid, PROC_TID_INO);
-		if (filldir(dirent, buf, len, pos, ino, DT_DIR < 0)) {
+		if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
 			/* returning this tgid failed, save it as the first
 			 * pid for the next readir call */
 			filp->f_version = tid;
@@ -2391,3 +2298,14 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 
 	return 0;
 }
+
+static struct inode_operations proc_task_inode_operations = {
+	.lookup		= proc_task_lookup,
+	.getattr	= proc_task_getattr,
+	.setattr	= proc_setattr,
+};
+
+static struct file_operations proc_task_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_task_readdir,
+};
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 146a434ba944..987c773dbb20 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -28,6 +28,7 @@ do {						\
 	(vmi)->largest_chunk = 0;		\
 } while(0)
 
+extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
 #endif
 
 extern void create_seq_entry(char *name, mode_t mode, const struct file_operations *f);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6a984f64edd7..1294eda4acae 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -100,7 +100,7 @@ static int notesize(struct memelfnote *en)
 	int sz;
 
 	sz = sizeof(struct elf_note);
-	sz += roundup(strlen(en->name), 4);
+	sz += roundup((strlen(en->name) + 1), 4);
 	sz += roundup(en->datasz, 4);
 
 	return sz;
@@ -116,7 +116,7 @@ static char *storenote(struct memelfnote *men, char *bufp)
 
 #define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0)
 
-	en.n_namesz = strlen(men->name);
+	en.n_namesz = strlen(men->name) + 1;
 	en.n_descsz = men->datasz;
 	en.n_type = men->type;
 
@@ -279,12 +279,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		tsz = elf_buflen - *fpos;
 		if (buflen < tsz)
 			tsz = buflen;
-		elf_buf = kmalloc(elf_buflen, GFP_ATOMIC);
+		elf_buf = kzalloc(elf_buflen, GFP_ATOMIC);
 		if (!elf_buf) {
 			read_unlock(&kclist_lock);
 			return -ENOMEM;
 		}
-		memset(elf_buf, 0, elf_buflen);
 		elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
 		read_unlock(&kclist_lock);
 		if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
@@ -330,10 +329,9 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			unsigned long curstart = start;
 			unsigned long cursize = tsz;
 
-			elf_buf = kmalloc(tsz, GFP_KERNEL);
+			elf_buf = kzalloc(tsz, GFP_KERNEL);
 			if (!elf_buf)
 				return -ENOMEM;
-			memset(elf_buf, 0, tsz);
 
 			read_lock(&vmlist_lock);
 			for (m=vmlist; m && cursize; m=m->next) {
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index cff10ab1af63..d7dbdf9e0f49 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,19 +33,15 @@
 #include "internal.h"
 
 /*
- * display a list of all the VMAs the kernel knows about
- * - nommu kernals have a single flat list
+ * display a single VMA to a sequenced file
  */
-static int nommu_vma_list_show(struct seq_file *m, void *v)
+int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 {
-	struct vm_area_struct *vma;
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
 	int flags, len;
 
-	vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb);
-
 	flags = vma->vm_flags;
 	file = vma->vm_file;
 
@@ -78,6 +74,18 @@ static int nommu_vma_list_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+/*
+ * display a list of all the VMAs the kernel knows about
+ * - nommu kernals have a single flat list
+ */
+static int nommu_vma_list_show(struct seq_file *m, void *v)
+{
+	struct vm_area_struct *vma;
+
+	vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb);
+	return nommu_vma_show(m, vma);
+}
+
 static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos)
 {
 	struct rb_node *_rb;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 5bbd60896050..93c43b676e59 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -45,6 +45,7 @@
 #include <linux/sysrq.h>
 #include <linux/vmalloc.h>
 #include <linux/crash_dump.h>
+#include <linux/pspace.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/io.h>
@@ -91,7 +92,7 @@ static int loadavg_read_proc(char *page, char **start, off_t off,
 		LOAD_INT(a), LOAD_FRAC(a),
 		LOAD_INT(b), LOAD_FRAC(b),
 		LOAD_INT(c), LOAD_FRAC(c),
-		nr_running(), nr_threads, last_pid);
+		nr_running(), nr_threads, init_pspace.last_pid);
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
@@ -277,12 +278,15 @@ static int devinfo_show(struct seq_file *f, void *v)
 		if (i == 0)
 			seq_printf(f, "Character devices:\n");
 		chrdev_show(f, i);
-	} else {
+	}
+#ifdef CONFIG_BLOCK
+	else {
 		i -= CHRDEV_MAJOR_HASH_SIZE;
 		if (i == 0)
 			seq_printf(f, "\nBlock devices:\n");
 		blkdev_show(f, i);
 	}
+#endif
 	return 0;
 }
 
@@ -355,6 +359,7 @@ static int stram_read_proc(char *page, char **start, off_t off,
 }
 #endif
 
+#ifdef CONFIG_BLOCK
 extern struct seq_operations partitions_op;
 static int partitions_open(struct inode *inode, struct file *file)
 {
@@ -378,6 +383,7 @@ static struct file_operations proc_diskstats_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+#endif
 
 #ifdef CONFIG_MODULES
 extern struct seq_operations modules_op;
@@ -641,7 +647,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
 
 		if (get_user(c, buf))
 			return -EFAULT;
-		__handle_sysrq(c, NULL, NULL, 0);
+		__handle_sysrq(c, NULL, 0);
 	}
 	return count;
 }
@@ -695,7 +701,9 @@ void __init proc_misc_init(void)
 		entry->proc_fops = &proc_kmsg_operations;
 	create_seq_entry("devices", 0, &proc_devinfo_operations);
 	create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
+#ifdef CONFIG_BLOCK
 	create_seq_entry("partitions", 0, &proc_partitions_operations);
+#endif
 	create_seq_entry("stat", 0, &proc_stat_operations);
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
 #ifdef CONFIG_SLAB
@@ -707,7 +715,9 @@ void __init proc_misc_init(void)
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
 	create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations);
+#ifdef CONFIG_BLOCK
 	create_seq_entry("diskstats", 0, &proc_diskstats_operations);
+#endif
 #ifdef CONFIG_MODULES
 	create_seq_entry("modules", 0, &proc_modules_operations);
 #endif
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 8901c65caca8..ffe66c38488b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/bitops.h>
 #include <linux/smp_lock.h>
+#include <linux/mount.h>
 
 #include "internal.h"
 
@@ -28,6 +29,17 @@ struct proc_dir_entry *proc_sys_root;
 static int proc_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
+	if (proc_mnt) {
+		/* Seed the root directory with a pid so it doesn't need
+		 * to be special in base.c.  I would do this earlier but
+		 * the only task alive when /proc is mounted the first time
+		 * is the init_task and it doesn't have any pids.
+		 */
+		struct proc_inode *ei;
+		ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
+		if (!ei->pid)
+			ei->pid = find_get_pid(1);
+	}
 	return get_sb_single(fs_type, flags, data, proc_fill_super, mnt);
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 0a163a4f7764..6b769afac55a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -122,11 +122,6 @@ struct mem_size_stats
 	unsigned long private_dirty;
 };
 
-__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
-{
-	return NULL;
-}
-
 static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
 	struct proc_maps_private *priv = m->private;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4616ed50ffcd..091aa8e48e02 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -138,25 +138,63 @@ out:
 }
 
 /*
- * Albert D. Cahalan suggested to fake entries for the traditional
- * sections here.  This might be worth investigating.
+ * display mapping lines for a particular process's /proc/pid/maps
  */
-static int show_map(struct seq_file *m, void *v)
+static int show_map(struct seq_file *m, void *_vml)
 {
-	return 0;
+	struct vm_list_struct *vml = _vml;
+	return nommu_vma_show(m, vml->vma);
 }
+
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
+	struct proc_maps_private *priv = m->private;
+	struct vm_list_struct *vml;
+	struct mm_struct *mm;
+	loff_t n = *pos;
+
+	/* pin the task and mm whilst we play with them */
+	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+	if (!priv->task)
+		return NULL;
+
+	mm = get_task_mm(priv->task);
+	if (!mm) {
+		put_task_struct(priv->task);
+		priv->task = NULL;
+		return NULL;
+	}
+
+	down_read(&mm->mmap_sem);
+
+	/* start from the Nth VMA */
+	for (vml = mm->context.vmlist; vml; vml = vml->next)
+		if (n-- == 0)
+			return vml;
 	return NULL;
 }
-static void m_stop(struct seq_file *m, void *v)
+
+static void m_stop(struct seq_file *m, void *_vml)
 {
+	struct proc_maps_private *priv = m->private;
+
+	if (priv->task) {
+		struct mm_struct *mm = priv->task->mm;
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+		put_task_struct(priv->task);
+	}
 }
-static void *m_next(struct seq_file *m, void *v, loff_t *pos)
+
+static void *m_next(struct seq_file *m, void *_vml, loff_t *pos)
 {
-	return NULL;
+	struct vm_list_struct *vml = _vml;
+
+	(*pos)++;
+	return vml ? vml->next : NULL;
 }
-static struct seq_operations proc_pid_maps_op = {
+
+static struct seq_operations proc_pid_maps_ops = {
 	.start	= m_start,
 	.next	= m_next,
 	.stop	= m_stop,
@@ -165,11 +203,19 @@ static struct seq_operations proc_pid_maps_op = {
 
 static int maps_open(struct inode *inode, struct file *file)
 {
-	int ret;
-	ret = seq_open(file, &proc_pid_maps_op);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = NULL;
+	struct proc_maps_private *priv;
+	int ret = -ENOMEM;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv) {
+		priv->pid = proc_pid(inode);
+		ret = seq_open(file, &proc_pid_maps_ops);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			m->private = priv;
+		} else {
+			kfree(priv);
+		}
 	}
 	return ret;
 }
@@ -178,6 +224,6 @@ struct file_operations proc_maps_operations = {
 	.open		= maps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_private,
 };
 
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 62af4b1348bd..467e5ac7280e 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -22,11 +22,13 @@
 const struct file_operations qnx4_file_operations =
 {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
 	.sendfile	= generic_file_sendfile,
 #ifdef CONFIG_QNX4FS_RW
-	.write		= generic_file_write,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.fsync		= qnx4_sync_file,
 #endif
 };
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 5a903491e697..5a41db2a218d 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -358,11 +358,10 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	const char *errmsg;
 	struct qnx4_sb_info *qs;
 
-	qs = kmalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
+	qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
 	if (!qs)
 		return -ENOMEM;
 	s->s_fs_info = qs;
-	memset(qs, 0, sizeof(struct qnx4_sb_info));
 
 	sb_set_blocksize(s, QNX4_BLOCK_SIZE);
 
@@ -497,7 +496,6 @@ static void qnx4_read_inode(struct inode *inode)
 	inode->i_ctime.tv_sec   = le32_to_cpu(raw_inode->di_ctime);
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_blocks  = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
-	inode->i_blksize = QNX4_DIR_ENTRY_SIZE;
 
 	memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
 	if (S_ISREG(inode->i_mode)) {
@@ -557,9 +555,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(qnx4_inode_cachep))
-		printk(KERN_INFO
-		       "qnx4_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(qnx4_inode_cachep);
 }
 
 static int qnx4_get_sb(struct file_system_type *fs_type,
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index c3d83f67154a..733cdf01d645 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -186,11 +186,10 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
 	memset(de->di_fname, 0, sizeof de->di_fname);
 	de->di_mode = 0;
 	mark_buffer_dirty(bh);
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	mark_inode_dirty(inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
-	dir->i_nlink--;
-	mark_inode_dirty(dir);
+	inode_dec_link_count(dir);
 	retval = 0;
 
       end_rmdir:
@@ -234,9 +233,8 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry)
 	mark_buffer_dirty(bh);
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
-	inode->i_nlink--;
 	inode->i_ctime = dir->i_ctime;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 	retval = 0;
 
 end_unlink:
diff --git a/fs/quota.c b/fs/quota.c
index d6a2be826e29..b9dae76a0b6e 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -338,6 +338,34 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 }
 
 /*
+ * look up a superblock on which quota ops will be performed
+ * - use the name of a block device to find the superblock thereon
+ */
+static inline struct super_block *quotactl_block(const char __user *special)
+{
+#ifdef CONFIG_BLOCK
+	struct block_device *bdev;
+	struct super_block *sb;
+	char *tmp = getname(special);
+
+	if (IS_ERR(tmp))
+		return ERR_PTR(PTR_ERR(tmp));
+	bdev = lookup_bdev(tmp);
+	putname(tmp);
+	if (IS_ERR(bdev))
+		return ERR_PTR(PTR_ERR(bdev));
+	sb = get_super(bdev);
+	bdput(bdev);
+	if (!sb)
+		return ERR_PTR(-ENODEV);
+
+	return sb;
+#else
+	return ERR_PTR(-ENODEV);
+#endif
+}
+
+/*
  * This is the system call interface. This communicates with
  * the user-level programs. Currently this only supports diskquota
  * calls. Maybe we need to add the process quotas etc. in the future,
@@ -347,25 +375,15 @@ asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t
 {
 	uint cmds, type;
 	struct super_block *sb = NULL;
-	struct block_device *bdev;
-	char *tmp;
 	int ret;
 
 	cmds = cmd >> SUBCMDSHIFT;
 	type = cmd & SUBCMDMASK;
 
 	if (cmds != Q_SYNC || special) {
-		tmp = getname(special);
-		if (IS_ERR(tmp))
-			return PTR_ERR(tmp);
-		bdev = lookup_bdev(tmp);
-		putname(tmp);
-		if (IS_ERR(bdev))
-			return PTR_ERR(bdev);
-		sb = get_super(bdev);
-		bdput(bdev);
-		if (!sb)
-			return -ENODEV;
+		sb = quotactl_block(special);
+		if (IS_ERR(sb))
+			return PTR_ERR(sb);
 	}
 
 	ret = check_quotactl_valid(sb, type, cmds, id);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 86f14cacf641..0947fb57dcf3 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -33,8 +33,10 @@ const struct address_space_operations ramfs_aops = {
 };
 
 const struct file_operations ramfs_file_operations = {
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= simple_sync_file,
 	.sendfile	= generic_file_sendfile,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 677139b48e00..bfe5dbf1002e 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -36,8 +36,10 @@ const struct address_space_operations ramfs_aops = {
 const struct file_operations ramfs_file_operations = {
 	.mmap			= ramfs_nommu_mmap,
 	.get_unmapped_area	= ramfs_nommu_get_unmapped_area,
-	.read			= generic_file_read,
-	.write			= generic_file_write,
+	.read			= do_sync_read,
+	.aio_read		= generic_file_aio_read,
+	.write			= do_sync_write,
+	.aio_write		= generic_file_aio_write,
 	.fsync			= simple_sync_file,
 	.sendfile		= generic_file_sendfile,
 	.llseek			= generic_file_llseek,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b9677335cc8d..2faf4cdf61b0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -58,7 +58,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
@@ -76,7 +75,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 			inode->i_fop = &simple_dir_operations;
 
 			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inode->i_nlink++;
+			inc_nlink(inode);
 			break;
 		case S_IFLNK:
 			inode->i_op = &page_symlink_inode_operations;
@@ -114,7 +113,7 @@ static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 {
 	int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
 	if (!retval)
-		dir->i_nlink++;
+		inc_nlink(dir);
 	return retval;
 }
 
diff --git a/fs/read_write.c b/fs/read_write.c
index d4cb3183c99c..f792000a28e6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -15,13 +15,15 @@
 #include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/pagemap.h>
+#include "read_write.h"
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
 const struct file_operations generic_ro_fops = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_readonly_mmap,
 	.sendfile	= generic_file_sendfile,
 };
@@ -227,14 +229,20 @@ static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
 
 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 {
+	struct iovec iov = { .iov_base = buf, .iov_len = len };
 	struct kiocb kiocb;
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	while (-EIOCBRETRY ==
-		(ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos)))
+	kiocb.ki_left = len;
+
+	for (;;) {
+		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
+		if (ret != -EIOCBRETRY)
+			break;
 		wait_on_retry_sync_kiocb(&kiocb);
+	}
 
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&kiocb);
@@ -279,14 +287,20 @@ EXPORT_SYMBOL(vfs_read);
 
 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 {
+	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
 	struct kiocb kiocb;
 	ssize_t ret;
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	while (-EIOCBRETRY ==
-	       (ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos)))
+	kiocb.ki_left = len;
+
+	for (;;) {
+		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
+		if (ret != -EIOCBRETRY)
+			break;
 		wait_on_retry_sync_kiocb(&kiocb);
+	}
 
 	if (-EIOCBQUEUED == ret)
 		ret = wait_on_sync_kiocb(&kiocb);
@@ -438,78 +452,155 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
 
 EXPORT_UNUSED_SYMBOL(iov_shorten);  /*  June 2006  */
 
+ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
+		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	init_sync_kiocb(&kiocb, filp);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_left = len;
+	kiocb.ki_nbytes = len;
+
+	for (;;) {
+		ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
+		if (ret != -EIOCBRETRY)
+			break;
+		wait_on_retry_sync_kiocb(&kiocb);
+	}
+
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+	*ppos = kiocb.ki_pos;
+	return ret;
+}
+
+/* Do it by hand, with file-ops */
+ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
+		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
+{
+	struct iovec *vector = iov;
+	ssize_t ret = 0;
+
+	while (nr_segs > 0) {
+		void __user *base;
+		size_t len;
+		ssize_t nr;
+
+		base = vector->iov_base;
+		len = vector->iov_len;
+		vector++;
+		nr_segs--;
+
+		nr = fn(filp, base, len, ppos);
+
+		if (nr < 0) {
+			if (!ret)
+				ret = nr;
+			break;
+		}
+		ret += nr;
+		if (nr != len)
+			break;
+	}
+
+	return ret;
+}
+
 /* A write operation does a read from user space and vice versa */
 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
 
+ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
+			      unsigned long nr_segs, unsigned long fast_segs,
+			      struct iovec *fast_pointer,
+			      struct iovec **ret_pointer)
+  {
+	unsigned long seg;
+  	ssize_t ret;
+	struct iovec *iov = fast_pointer;
+
+  	/*
+  	 * SuS says "The readv() function *may* fail if the iovcnt argument
+  	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
+  	 * traditionally returned zero for zero segments, so...
+  	 */
+	if (nr_segs == 0) {
+		ret = 0;
+  		goto out;
+	}
+
+  	/*
+  	 * First get the "struct iovec" from user memory and
+  	 * verify all the pointers
+  	 */
+	if (nr_segs > UIO_MAXIOV) {
+		ret = -EINVAL;
+  		goto out;
+	}
+	if (nr_segs > fast_segs) {
+  		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+		if (iov == NULL) {
+			ret = -ENOMEM;
+  			goto out;
+		}
+  	}
+	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
+		ret = -EFAULT;
+  		goto out;
+	}
+
+  	/*
+	 * According to the Single Unix Specification we should return EINVAL
+	 * if an element length is < 0 when cast to ssize_t or if the
+	 * total length would overflow the ssize_t return value of the
+	 * system call.
+  	 */
+	ret = 0;
+  	for (seg = 0; seg < nr_segs; seg++) {
+  		void __user *buf = iov[seg].iov_base;
+  		ssize_t len = (ssize_t)iov[seg].iov_len;
+
+		/* see if we we're about to use an invalid len or if
+		 * it's about to overflow ssize_t */
+		if (len < 0 || (ret + len < ret)) {
+			ret = -EINVAL;
+  			goto out;
+		}
+		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
+			ret = -EFAULT;
+  			goto out;
+		}
+
+		ret += len;
+  	}
+out:
+	*ret_pointer = iov;
+	return ret;
+}
+
 static ssize_t do_readv_writev(int type, struct file *file,
 			       const struct iovec __user * uvector,
 			       unsigned long nr_segs, loff_t *pos)
 {
-	typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
-	typedef ssize_t (*iov_fn_t)(struct file *, const struct iovec *, unsigned long, loff_t *);
-
 	size_t tot_len;
 	struct iovec iovstack[UIO_FASTIOV];
-	struct iovec *iov=iovstack, *vector;
+	struct iovec *iov = iovstack;
 	ssize_t ret;
-	int seg;
 	io_fn_t fn;
 	iov_fn_t fnv;
 
-	/*
-	 * SuS says "The readv() function *may* fail if the iovcnt argument
-	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
-	 * traditionally returned zero for zero segments, so...
-	 */
-	ret = 0;
-	if (nr_segs == 0)
+	if (!file->f_op) {
+		ret = -EINVAL;
 		goto out;
+	}
 
-	/*
-	 * First get the "struct iovec" from user memory and
-	 * verify all the pointers
-	 */
-	ret = -EINVAL;
-	if (nr_segs > UIO_MAXIOV)
-		goto out;
-	if (!file->f_op)
-		goto out;
-	if (nr_segs > UIO_FASTIOV) {
-		ret = -ENOMEM;
-		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
-		if (!iov)
-			goto out;
-	}
-	ret = -EFAULT;
-	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector)))
+	ret = rw_copy_check_uvector(type, uvector, nr_segs,
+			ARRAY_SIZE(iovstack), iovstack, &iov);
+	if (ret <= 0)
 		goto out;
 
-	/*
-	 * Single unix specification:
-	 * We should -EINVAL if an element length is not >= 0 and fitting an
-	 * ssize_t.  The total length is fitting an ssize_t
-	 *
-	 * Be careful here because iov_len is a size_t not an ssize_t
-	 */
-	tot_len = 0;
-	ret = -EINVAL;
-	for (seg = 0; seg < nr_segs; seg++) {
-		void __user *buf = iov[seg].iov_base;
-		ssize_t len = (ssize_t)iov[seg].iov_len;
-
-		if (len < 0)	/* size_t not fitting an ssize_t .. */
-			goto out;
-		if (unlikely(!access_ok(vrfy_dir(type), buf, len)))
-			goto Efault;
-		tot_len += len;
-		if ((ssize_t)tot_len < 0) /* maths overflow on the ssize_t */
-			goto out;
-	}
-	if (tot_len == 0) {
-		ret = 0;
-		goto out;
-	}
-
+	tot_len = ret;
 	ret = rw_verify_area(type, file, pos, tot_len);
 	if (ret < 0)
 		goto out;
@@ -520,39 +611,18 @@ static ssize_t do_readv_writev(int type, struct file *file,
 	fnv = NULL;
 	if (type == READ) {
 		fn = file->f_op->read;
-		fnv = file->f_op->readv;
+		fnv = file->f_op->aio_read;
 	} else {
 		fn = (io_fn_t)file->f_op->write;
-		fnv = file->f_op->writev;
+		fnv = file->f_op->aio_write;
 	}
-	if (fnv) {
-		ret = fnv(file, iov, nr_segs, pos);
-		goto out;
-	}
-
-	/* Do it by hand, with file-ops */
-	ret = 0;
-	vector = iov;
-	while (nr_segs > 0) {
-		void __user * base;
-		size_t len;
-		ssize_t nr;
 
-		base = vector->iov_base;
-		len = vector->iov_len;
-		vector++;
-		nr_segs--;
-
-		nr = fn(file, base, len, pos);
+	if (fnv)
+		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
+						pos, fnv);
+	else
+		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
 
-		if (nr < 0) {
-			if (!ret) ret = nr;
-			break;
-		}
-		ret += nr;
-		if (nr != len)
-			break;
-	}
 out:
 	if (iov != iovstack)
 		kfree(iov);
@@ -563,9 +633,6 @@ out:
 			fsnotify_modify(file->f_dentry);
 	}
 	return ret;
-Efault:
-	ret = -EFAULT;
-	goto out;
 }
 
 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
@@ -573,7 +640,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
 {
 	if (!(file->f_mode & FMODE_READ))
 		return -EBADF;
-	if (!file->f_op || (!file->f_op->readv && !file->f_op->read))
+	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
 		return -EINVAL;
 
 	return do_readv_writev(READ, file, vec, vlen, pos);
@@ -586,7 +653,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 {
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
-	if (!file->f_op || (!file->f_op->writev && !file->f_op->write))
+	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
 		return -EINVAL;
 
 	return do_readv_writev(WRITE, file, vec, vlen, pos);
diff --git a/fs/read_write.h b/fs/read_write.h
new file mode 100644
index 000000000000..d07b954c6e0c
--- /dev/null
+++ b/fs/read_write.h
@@ -0,0 +1,14 @@
+/*
+ * This file is only for sharing some helpers from read_write.c with compat.c.
+ * Don't use anywhere else.
+ */
+
+
+typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
+typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
+		unsigned long, loff_t);
+
+ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
+		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn);
+ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
+		unsigned long nr_segs, loff_t *ppos, io_fn_t fn);
diff --git a/fs/readdir.c b/fs/readdir.c
index b6109329b607..bff3ee58e2f8 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -69,20 +69,24 @@ struct readdir_callback {
 };
 
 static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset,
-		      ino_t ino, unsigned int d_type)
+		      u64 ino, unsigned int d_type)
 {
 	struct readdir_callback * buf = (struct readdir_callback *) __buf;
 	struct old_linux_dirent __user * dirent;
+	unsigned long d_ino;
 
 	if (buf->result)
 		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+		return -EOVERFLOW;
 	buf->result++;
 	dirent = buf->dirent;
 	if (!access_ok(VERIFY_WRITE, dirent,
 			(unsigned long)(dirent->d_name + namlen + 1) -
 				(unsigned long)dirent))
 		goto efault;
-	if (	__put_user(ino, &dirent->d_ino) ||
+	if (	__put_user(d_ino, &dirent->d_ino) ||
 		__put_user(offset, &dirent->d_offset) ||
 		__put_user(namlen, &dirent->d_namlen) ||
 		__copy_to_user(dirent->d_name, name, namlen) ||
@@ -138,22 +142,26 @@ struct getdents_callback {
 };
 
 static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
-		   ino_t ino, unsigned int d_type)
+		   u64 ino, unsigned int d_type)
 {
 	struct linux_dirent __user * dirent;
 	struct getdents_callback * buf = (struct getdents_callback *) __buf;
+	unsigned long d_ino;
 	int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 2);
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
 		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+		return -EOVERFLOW;
 	dirent = buf->previous;
 	if (dirent) {
 		if (__put_user(offset, &dirent->d_off))
 			goto efault;
 	}
 	dirent = buf->current_dir;
-	if (__put_user(ino, &dirent->d_ino))
+	if (__put_user(d_ino, &dirent->d_ino))
 		goto efault;
 	if (__put_user(reclen, &dirent->d_reclen))
 		goto efault;
@@ -222,7 +230,7 @@ struct getdents_callback64 {
 };
 
 static int filldir64(void * __buf, const char * name, int namlen, loff_t offset,
-		     ino_t ino, unsigned int d_type)
+		     u64 ino, unsigned int d_type)
 {
 	struct linux_dirent64 __user *dirent;
 	struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 3a59309f3ca9..0eb7ac080484 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -28,7 +28,7 @@ endif
 # will work around it. If any other architecture displays this behavior,
 # add it here.
 ifeq ($(CONFIG_PPC32),y)
-EXTRA_CFLAGS := -O1
+EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0400, -O1)
 endif
 
 TAGS:
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 4a7dbdee1b6d..e3d466a228d4 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -9,6 +9,7 @@
 #include <linux/buffer_head.h>
 #include <linux/kernel.h>
 #include <linux/pagemap.h>
+#include <linux/vmalloc.h>
 #include <linux/reiserfs_fs_sb.h>
 #include <linux/reiserfs_fs_i.h>
 #include <linux/quotaops.h>
@@ -50,16 +51,15 @@ static inline void get_bit_address(struct super_block *s,
 {
 	/* It is in the bitmap block number equal to the block
 	 * number divided by the number of bits in a block. */
-	*bmap_nr = block / (s->s_blocksize << 3);
+	*bmap_nr = block >> (s->s_blocksize_bits + 3);
 	/* Within that bitmap block it is located at bit offset *offset. */
 	*offset = block & ((s->s_blocksize << 3) - 1);
-	return;
 }
 
 #ifdef CONFIG_REISERFS_CHECK
 int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
 {
-	int i, j;
+	int bmap, offset;
 
 	if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
 		reiserfs_warning(s,
@@ -68,36 +68,32 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
 		return 0;
 	}
 
-	/* it can't be one of the bitmap blocks */
-	for (i = 0; i < SB_BMAP_NR(s); i++)
-		if (block == SB_AP_BITMAP(s)[i].bh->b_blocknr) {
+	get_bit_address(s, block, &bmap, &offset);
+
+	/* Old format filesystem? Unlikely, but the bitmaps are all up front so
+	 * we need to account for it. */
+	if (unlikely(test_bit(REISERFS_OLD_FORMAT,
+			      &(REISERFS_SB(s)->s_properties)))) {
+		b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
+		if (block >= bmap1 && block <= bmap1 + SB_BMAP_NR(s)) {
+			reiserfs_warning(s, "vs: 4019: is_reusable: "
+					 "bitmap block %lu(%u) can't be freed or reused",
+					 block, SB_BMAP_NR(s));
+			return 0;
+		}
+	} else {
+		if (offset == 0) {
 			reiserfs_warning(s, "vs: 4020: is_reusable: "
 					 "bitmap block %lu(%u) can't be freed or reused",
 					 block, SB_BMAP_NR(s));
 			return 0;
 		}
-
-	get_bit_address(s, block, &i, &j);
-
-	if (i >= SB_BMAP_NR(s)) {
-		reiserfs_warning(s,
-				 "vs-4030: is_reusable: there is no so many bitmap blocks: "
-				 "block=%lu, bitmap_nr=%d", block, i);
-		return 0;
 	}
 
-	if ((bit_value == 0 &&
-	     reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data)) ||
-	    (bit_value == 1 &&
-	     reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data) == 0)) {
+	if (bmap >= SB_BMAP_NR(s)) {
 		reiserfs_warning(s,
-				 "vs-4040: is_reusable: corresponding bit of block %lu does not "
-				 "match required value (i==%d, j==%d) test_bit==%d",
-				 block, i, j, reiserfs_test_le_bit(j,
-								   SB_AP_BITMAP
-								   (s)[i].bh->
-								   b_data));
-
+				 "vs-4030: is_reusable: there is no so many bitmap blocks: "
+				 "block=%lu, bitmap_nr=%d", block, bmap);
 		return 0;
 	}
 
@@ -141,6 +137,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 {
 	struct super_block *s = th->t_super;
 	struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n];
+	struct buffer_head *bh;
 	int end, next;
 	int org = *beg;
 
@@ -159,22 +156,25 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 				 bmap_n);
 		return 0;
 	}
-	if (buffer_locked(bi->bh)) {
-		PROC_INFO_INC(s, scan_bitmap.wait);
-		__wait_on_buffer(bi->bh);
-	}
+
+	bh = reiserfs_read_bitmap_block(s, bmap_n);
+	if (bh == NULL)
+		return 0;
 
 	while (1) {
 	      cont:
-		if (bi->free_count < min)
+		if (bi->free_count < min) {
+			brelse(bh);
 			return 0;	// No free blocks in this bitmap
+		}
 
 		/* search for a first zero bit -- beggining of a window */
 		*beg = reiserfs_find_next_zero_le_bit
-		    ((unsigned long *)(bi->bh->b_data), boundary, *beg);
+		    ((unsigned long *)(bh->b_data), boundary, *beg);
 
 		if (*beg + min > boundary) {	/* search for a zero bit fails or the rest of bitmap block
 						 * cannot contain a zero window of minimum size */
+			brelse(bh);
 			return 0;
 		}
 
@@ -183,7 +183,7 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 		/* first zero bit found; we check next bits */
 		for (end = *beg + 1;; end++) {
 			if (end >= *beg + max || end >= boundary
-			    || reiserfs_test_le_bit(end, bi->bh->b_data)) {
+			    || reiserfs_test_le_bit(end, bh->b_data)) {
 				next = end;
 				break;
 			}
@@ -197,12 +197,12 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 		 * (end) points to one bit after the window end */
 		if (end - *beg >= min) {	/* it seems we have found window of proper size */
 			int i;
-			reiserfs_prepare_for_journal(s, bi->bh, 1);
+			reiserfs_prepare_for_journal(s, bh, 1);
 			/* try to set all blocks used checking are they still free */
 			for (i = *beg; i < end; i++) {
 				/* It seems that we should not check in journal again. */
 				if (reiserfs_test_and_set_le_bit
-				    (i, bi->bh->b_data)) {
+				    (i, bh->b_data)) {
 					/* bit was set by another process
 					 * while we slept in prepare_for_journal() */
 					PROC_INFO_INC(s, scan_bitmap.stolen);
@@ -214,17 +214,16 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
 					/* otherwise we clear all bit were set ... */
 					while (--i >= *beg)
 						reiserfs_test_and_clear_le_bit
-						    (i, bi->bh->b_data);
-					reiserfs_restore_prepared_buffer(s,
-									 bi->
-									 bh);
+						    (i, bh->b_data);
+					reiserfs_restore_prepared_buffer(s, bh);
 					*beg = org;
 					/* ... and search again in current block from beginning */
 					goto cont;
 				}
 			}
 			bi->free_count -= (end - *beg);
-			journal_mark_dirty(th, s, bi->bh);
+			journal_mark_dirty(th, s, bh);
+			brelse(bh);
 
 			/* free block count calculation */
 			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
@@ -266,9 +265,20 @@ static int bmap_hash_id(struct super_block *s, u32 id)
  */
 static inline int block_group_used(struct super_block *s, u32 id)
 {
-	int bm;
-	bm = bmap_hash_id(s, id);
-	if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100)) {
+	int bm = bmap_hash_id(s, id);
+	struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm];
+
+	/* If we don't have cached information on this bitmap block, we're
+	 * going to have to load it later anyway. Loading it here allows us
+	 * to make a better decision. This favors long-term performace gain
+	 * with a better on-disk layout vs. a short term gain of skipping the
+	 * read and potentially having a bad placement. */
+	if (info->first_zero_hint == 0) {
+		struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm);
+		brelse(bh);
+	}
+
+	if (info->free_count > ((s->s_blocksize << 3) * 60 / 100)) {
 		return 0;
 	}
 	return 1;
@@ -373,7 +383,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 {
 	struct super_block *s = th->t_super;
 	struct reiserfs_super_block *rs;
-	struct buffer_head *sbh;
+	struct buffer_head *sbh, *bmbh;
 	struct reiserfs_bitmap_info *apbi;
 	int nr, offset;
 
@@ -394,16 +404,21 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 		return;
 	}
 
-	reiserfs_prepare_for_journal(s, apbi[nr].bh, 1);
+	bmbh = reiserfs_read_bitmap_block(s, nr);
+	if (!bmbh)
+		return;
+
+	reiserfs_prepare_for_journal(s, bmbh, 1);
 
 	/* clear bit for the given block in bit map */
-	if (!reiserfs_test_and_clear_le_bit(offset, apbi[nr].bh->b_data)) {
+	if (!reiserfs_test_and_clear_le_bit(offset, bmbh->b_data)) {
 		reiserfs_warning(s, "vs-4080: reiserfs_free_block: "
 				 "free_block (%s:%lu)[dev:blocknr]: bit already cleared",
 				 reiserfs_bdevname(s), block);
 	}
 	apbi[nr].free_count++;
-	journal_mark_dirty(th, s, apbi[nr].bh);
+	journal_mark_dirty(th, s, bmbh);
+	brelse(bmbh);
 
 	reiserfs_prepare_for_journal(s, sbh, 1);
 	/* update super block */
@@ -1019,7 +1034,6 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 	b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
 	int passno = 0;
 	int nr_allocated = 0;
-	int bigalloc = 0;
 
 	determine_prealloc_size(hint);
 	if (!hint->formatted_node) {
@@ -1046,28 +1060,9 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 				hint->preallocate = hint->prealloc_size = 0;
 		}
 		/* for unformatted nodes, force large allocations */
-		bigalloc = amount_needed;
 	}
 
 	do {
-		/* in bigalloc mode, nr_allocated should stay zero until
-		 * the entire allocation is filled
-		 */
-		if (unlikely(bigalloc && nr_allocated)) {
-			reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n",
-					 bigalloc, nr_allocated);
-			/* reset things to a sane value */
-			bigalloc = amount_needed - nr_allocated;
-		}
-		/*
-		 * try pass 0 and pass 1 looking for a nice big
-		 * contiguous allocation.  Then reset and look
-		 * for anything you can find.
-		 */
-		if (passno == 2 && bigalloc) {
-			passno = 0;
-			bigalloc = 0;
-		}
 		switch (passno++) {
 		case 0:	/* Search from hint->search_start to end of disk */
 			start = hint->search_start;
@@ -1105,8 +1100,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 								 new_blocknrs +
 								 nr_allocated,
 								 start, finish,
-								 bigalloc ?
-								 bigalloc : 1,
+								 1,
 								 amount_needed -
 								 nr_allocated,
 								 hint->
@@ -1263,3 +1257,89 @@ int reiserfs_can_fit_pages(struct super_block *sb	/* superblock of filesystem
 
 	return space > 0 ? space : 0;
 }
+
+void reiserfs_cache_bitmap_metadata(struct super_block *sb,
+                                    struct buffer_head *bh,
+                                    struct reiserfs_bitmap_info *info)
+{
+	unsigned long *cur = (unsigned long *)(bh->b_data + bh->b_size);
+
+	info->first_zero_hint = 1 << (sb->s_blocksize_bits + 3);
+
+	while (--cur >= (unsigned long *)bh->b_data) {
+		int base = ((char *)cur - bh->b_data) << 3;
+
+		/* 0 and ~0 are special, we can optimize for them */
+		if (*cur == 0) {
+			info->first_zero_hint = base;
+			info->free_count += BITS_PER_LONG;
+		} else if (*cur != ~0L) {       /* A mix, investigate */
+			int b;
+			for (b = BITS_PER_LONG - 1; b >= 0; b--) {
+				if (!reiserfs_test_le_bit(b, cur)) {
+					info->first_zero_hint = base + b;
+					info->free_count++;
+				}
+			}
+		}
+	}
+	/* The first bit must ALWAYS be 1 */
+	BUG_ON(info->first_zero_hint == 0);
+}
+
+struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
+                                               unsigned int bitmap)
+{
+	b_blocknr_t block = (sb->s_blocksize << 3) * bitmap;
+	struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap;
+	struct buffer_head *bh;
+
+	/* Way old format filesystems had the bitmaps packed up front.
+	 * I doubt there are any of these left, but just in case... */
+	if (unlikely(test_bit(REISERFS_OLD_FORMAT,
+	                      &(REISERFS_SB(sb)->s_properties))))
+		block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap;
+	else if (bitmap == 0)
+		block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
+
+	bh = sb_bread(sb, block);
+	if (bh == NULL)
+		reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
+		                 "reading failed", __FUNCTION__, block);
+	else {
+		if (buffer_locked(bh)) {
+			PROC_INFO_INC(sb, scan_bitmap.wait);
+			__wait_on_buffer(bh);
+		}
+		BUG_ON(!buffer_uptodate(bh));
+		BUG_ON(atomic_read(&bh->b_count) == 0);
+
+		if (info->first_zero_hint == 0)
+			reiserfs_cache_bitmap_metadata(sb, bh, info);
+	}
+
+	return bh;
+}
+
+int reiserfs_init_bitmap_cache(struct super_block *sb)
+{
+	struct reiserfs_bitmap_info *bitmap;
+
+	bitmap = vmalloc(sizeof (*bitmap) * SB_BMAP_NR(sb));
+	if (bitmap == NULL)
+		return -ENOMEM;
+
+	memset(bitmap, 0, sizeof (*bitmap) * SB_BMAP_NR(sb));
+
+	SB_AP_BITMAP(sb) = bitmap;
+
+	return 0;
+}
+
+void reiserfs_free_bitmap_cache(struct super_block *sb)
+{
+	if (SB_AP_BITMAP(sb)) {
+		vfree(SB_AP_BITMAP(sb));
+		SB_AP_BITMAP(sb) = NULL;
+	}
+}
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 9aabcc0ccd2d..657050ad7430 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -22,6 +22,9 @@ const struct file_operations reiserfs_dir_operations = {
 	.readdir = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
 	.ioctl = reiserfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = reiserfs_compat_ioctl,
+#endif
 };
 
 static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1627edd50810..b67ce9354048 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -37,8 +37,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
 	int err;
 	int jbegin_failure = 0;
 
-	if (!S_ISREG(inode->i_mode))
-		BUG();
+	BUG_ON(!S_ISREG(inode->i_mode));
 
 	/* fast out for when nothing needs to be done */
 	if ((atomic_read(&inode->i_count) > 1 ||
@@ -124,13 +123,12 @@ static int reiserfs_sync_file(struct file *p_s_filp,
 	int n_err;
 	int barrier_done;
 
-	if (!S_ISREG(p_s_inode->i_mode))
-		BUG();
+	BUG_ON(!S_ISREG(p_s_inode->i_mode));
 	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
 	reiserfs_write_lock(p_s_inode->i_sb);
 	barrier_done = reiserfs_commit_for_inode(p_s_inode);
 	reiserfs_write_unlock(p_s_inode->i_sb);
-	if (barrier_done != 1)
+	if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb))
 		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
 	if (barrier_done < 0)
 		return barrier_done;
@@ -1333,7 +1331,7 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 			if (err)
 				return err;
 		}
-		result = generic_file_write(file, buf, count, ppos);
+		result = do_sync_write(file, buf, count, ppos);
 
 		if (after_file_end) {	/* Now update i_size and remove the savelink */
 			struct reiserfs_transaction_handle th;
@@ -1565,10 +1563,14 @@ static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going t
 }
 
 const struct file_operations reiserfs_file_operations = {
-	.read = generic_file_read,
+	.read = do_sync_read,
 	.write = reiserfs_file_write,
 	.ioctl = reiserfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl = reiserfs_compat_ioctl,
+#endif
 	.mmap = generic_file_mmap,
+	.open = generic_file_open,
 	.release = reiserfs_file_release,
 	.fsync = reiserfs_sync_file,
 	.sendfile = generic_file_sendfile,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 52f1e2136546..9c69bcacad22 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -17,8 +17,6 @@
 #include <linux/writeback.h>
 #include <linux/quotaops.h>
 
-extern int reiserfs_default_io_size;	/* default io size devuned in super.c */
-
 static int reiserfs_commit_write(struct file *f, struct page *page,
 				 unsigned from, unsigned to);
 static int reiserfs_prepare_write(struct file *f, struct page *page,
@@ -1122,7 +1120,6 @@ static void init_inode(struct inode *inode, struct path *path)
 	ih = PATH_PITEM_HEAD(path);
 
 	copy_key(INODE_PKEY(inode), &(ih->ih_key));
-	inode->i_blksize = reiserfs_default_io_size;
 
 	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
 	REISERFS_I(inode)->i_flags = 0;
@@ -1130,9 +1127,9 @@ static void init_inode(struct inode *inode, struct path *path)
 	REISERFS_I(inode)->i_prealloc_count = 0;
 	REISERFS_I(inode)->i_trans_id = 0;
 	REISERFS_I(inode)->i_jl = NULL;
-	REISERFS_I(inode)->i_acl_access = NULL;
-	REISERFS_I(inode)->i_acl_default = NULL;
-	init_rwsem(&REISERFS_I(inode)->xattr_sem);
+	reiserfs_init_acl_access(inode);
+	reiserfs_init_acl_default(inode);
+	reiserfs_init_xattr_rwsem(inode);
 
 	if (stat_data_v1(ih)) {
 		struct stat_data_v1 *sd =
@@ -1783,7 +1780,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		err = -EDQUOT;
 		goto out_end_trans;
 	}
-	if (!dir || !dir->i_nlink) {
+	if (!dir->i_nlink) {
 		err = -EPERM;
 		goto out_bad_inode;
 	}
@@ -1837,9 +1834,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	REISERFS_I(inode)->i_attrs =
 	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
 	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
-	REISERFS_I(inode)->i_acl_access = NULL;
-	REISERFS_I(inode)->i_acl_default = NULL;
-	init_rwsem(&REISERFS_I(inode)->xattr_sem);
+	reiserfs_init_acl_access(inode);
+	reiserfs_init_acl_default(inode);
+	reiserfs_init_xattr_rwsem(inode);
 
 	if (old_format_only(sb))
 		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
@@ -1877,7 +1874,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	}
 	// these do not go to on-disk stat data
 	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
-	inode->i_blksize = reiserfs_default_io_size;
 
 	// store in in-core inode the key of stat data and version all
 	// object items will have (directory items will have old offset
@@ -1978,11 +1974,13 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
 	 * code really needs to be reworked, but this will take care of it
 	 * for now. -jeffm */
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
 	if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) {
 		reiserfs_write_unlock_xattrs(dir->i_sb);
 		iput(inode);
 		reiserfs_write_lock_xattrs(dir->i_sb);
 	} else
+#endif
 		iput(inode);
 	return err;
 }
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index a986b5e1e288..9c57578cb831 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,6 +9,7 @@
 #include <asm/uaccess.h>
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
+#include <linux/compat.h>
 
 static int reiserfs_unpack(struct inode *inode, struct file *filp);
 
@@ -94,6 +95,40 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 	}
 }
 
+#ifdef CONFIG_COMPAT
+long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	int ret;
+
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case REISERFS_IOC32_UNPACK:
+		cmd = REISERFS_IOC_UNPACK;
+		break;
+	case REISERFS_IOC32_GETFLAGS:
+		cmd = REISERFS_IOC_GETFLAGS;
+		break;
+	case REISERFS_IOC32_SETFLAGS:
+		cmd = REISERFS_IOC_SETFLAGS;
+		break;
+	case REISERFS_IOC32_GETVERSION:
+		cmd = REISERFS_IOC_GETVERSION;
+		break;
+	case REISERFS_IOC32_SETVERSION:
+		cmd = REISERFS_IOC_SETVERSION;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	lock_kernel();
+	ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+	unlock_kernel();
+	return ret;
+}
+#endif
+
 /*
 ** reiserfs_unpack
 ** Function try to convert tail from direct item into indirect.
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index 7a88adbceef6..b9b423b22a8b 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -75,8 +75,7 @@ static int sd_create_vi(struct virtual_node *vn,
 static int sd_check_left(struct virtual_item *vi, int free,
 			 int start_skip, int end_skip)
 {
-	if (start_skip || end_skip)
-		BUG();
+	BUG_ON(start_skip || end_skip);
 	return -1;
 }
 
@@ -87,8 +86,7 @@ static int sd_check_right(struct virtual_item *vi, int free)
 
 static int sd_part_size(struct virtual_item *vi, int first, int count)
 {
-	if (count)
-		BUG();
+	BUG_ON(count);
 	return 0;
 }
 
@@ -476,8 +474,7 @@ static int direntry_create_vi(struct virtual_node *vn,
 
 	vi->vi_index = TYPE_DIRENTRY;
 
-	if (!(vi->vi_ih) || !vi->vi_item)
-		BUG();
+	BUG_ON(!(vi->vi_ih) || !vi->vi_item);
 
 	dir_u->flags = 0;
 	if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET)
@@ -575,8 +572,7 @@ static int direntry_check_right(struct virtual_item *vi, int free)
 		free -= dir_u->entry_sizes[i];
 		entries++;
 	}
-	if (entries == dir_u->entry_count)
-		BUG();
+	BUG_ON(entries == dir_u->entry_count);
 
 	/* "." and ".." can not be separated from each other */
 	if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9b3672d69367..85ce23268302 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -53,6 +53,7 @@
 #include <linux/workqueue.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 /* gets a struct reiserfs_journal_list * from a list head */
 #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
@@ -718,8 +719,7 @@ static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
 			spinlock_t * lock, void (fn) (struct buffer_chunk *))
 {
 	int ret = 0;
-	if (chunk->nr >= CHUNK_SIZE)
-		BUG();
+	BUG_ON(chunk->nr >= CHUNK_SIZE);
 	chunk->bh[chunk->nr++] = bh;
 	if (chunk->nr >= CHUNK_SIZE) {
 		ret = 1;
@@ -788,8 +788,7 @@ static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
 		/* buffer must be locked for __add_jh, should be able to have
 		 * two adds at the same time
 		 */
-		if (bh->b_private)
-			BUG();
+		BUG_ON(bh->b_private);
 		jh->bh = bh;
 		bh->b_private = jh;
 	}
@@ -972,7 +971,7 @@ int reiserfs_async_progress_wait(struct super_block *s)
 	DEFINE_WAIT(wait);
 	struct reiserfs_journal *j = SB_JOURNAL(s);
 	if (atomic_read(&j->j_async_throttle))
-		blk_congestion_wait(WRITE, HZ / 10);
+		congestion_wait(WRITE, HZ / 10);
 	return 0;
 }
 
@@ -1186,6 +1185,21 @@ static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
 	return NULL;
 }
 
+static int newer_jl_done(struct reiserfs_journal_cnode *cn)
+{
+	struct super_block *sb = cn->sb;
+	b_blocknr_t blocknr = cn->blocknr;
+
+	cn = cn->hprev;
+	while (cn) {
+		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist &&
+		    atomic_read(&cn->jlist->j_commit_left) != 0)
+				    return 0;
+		cn = cn->hprev;
+	}
+	return 1;
+}
+
 static void remove_journal_hash(struct super_block *,
 				struct reiserfs_journal_cnode **,
 				struct reiserfs_journal_list *, unsigned long,
@@ -1604,6 +1618,31 @@ static int flush_journal_list(struct super_block *s,
 	return err;
 }
 
+static int test_transaction(struct super_block *s,
+                            struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal_cnode *cn;
+
+	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0)
+		return 1;
+
+	cn = jl->j_realblock;
+	while (cn) {
+		/* if the blocknr == 0, this has been cleared from the hash,
+		 ** skip it
+		 */
+		if (cn->blocknr == 0) {
+			goto next;
+		}
+		if (cn->bh && !newer_jl_done(cn))
+			return 0;
+	      next:
+		cn = cn->next;
+		cond_resched();
+	}
+	return 0;
+}
+
 static int write_one_transaction(struct super_block *s,
 				 struct reiserfs_journal_list *jl,
 				 struct buffer_chunk *chunk)
@@ -2927,8 +2966,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
 	int retval;
 
 	reiserfs_check_lock_depth(p_s_sb, "journal_begin");
-	if (nblocks > journal->j_trans_max)
-		BUG();
+	BUG_ON(nblocks > journal->j_trans_max);
 
 	PROC_INFO_INC(p_s_sb, journal.journal_being);
 	/* set here for journal_join */
@@ -3044,9 +3082,8 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
 	if (reiserfs_transaction_running(s)) {
 		th = current->journal_info;
 		th->t_refcount++;
-		if (th->t_refcount < 2) {
-			BUG();
-		}
+		BUG_ON(th->t_refcount < 2);
+		
 		return th;
 	}
 	th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
@@ -3086,9 +3123,7 @@ static int journal_join(struct reiserfs_transaction_handle *th,
 	 ** pointer
 	 */
 	th->t_handle_save = cur_th;
-	if (cur_th && cur_th->t_refcount > 1) {
-		BUG();
-	}
+	BUG_ON(cur_th && cur_th->t_refcount > 1);
 	return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN);
 }
 
@@ -3101,9 +3136,7 @@ int journal_join_abort(struct reiserfs_transaction_handle *th,
 	 ** pointer
 	 */
 	th->t_handle_save = cur_th;
-	if (cur_th && cur_th->t_refcount > 1) {
-		BUG();
-	}
+	BUG_ON(cur_th && cur_th->t_refcount > 1);
 	return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT);
 }
 
@@ -3138,8 +3171,7 @@ int journal_begin(struct reiserfs_transaction_handle *th,
 		current->journal_info = th;
 	}
 	ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG);
-	if (current->journal_info != th)
-		BUG();
+	BUG_ON(current->journal_info != th);
 
 	/* I guess this boils down to being the reciprocal of clm-2100 above.
 	 * If do_journal_begin_r fails, we need to put it back, since journal_end
@@ -3284,8 +3316,7 @@ int journal_end(struct reiserfs_transaction_handle *th,
 		/* we aren't allowed to close a nested transaction on a different
 		 ** filesystem from the one in the task struct
 		 */
-		if (cur_th->t_super != th->t_super)
-			BUG();
+		BUG_ON(cur_th->t_super != th->t_super);
 
 		if (th != cur_th) {
 			memcpy(current->journal_info, th, sizeof(*th));
@@ -3404,9 +3435,7 @@ int journal_end_sync(struct reiserfs_transaction_handle *th,
 
 	BUG_ON(!th->t_trans_id);
 	/* you can sync while nested, very, very bad */
-	if (th->t_refcount > 1) {
-		BUG();
-	}
+	BUG_ON(th->t_refcount > 1);
 	if (journal->j_len == 0) {
 		reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb),
 					     1);
@@ -3433,16 +3462,6 @@ static void flush_async_commits(void *p)
 		flush_commit_list(p_s_sb, jl, 1);
 	}
 	unlock_kernel();
-	/*
-	 * this is a little racey, but there's no harm in missing
-	 * the filemap_fdata_write
-	 */
-	if (!atomic_read(&journal->j_async_throttle)
-	    && !reiserfs_is_journal_aborted(journal)) {
-		atomic_inc(&journal->j_async_throttle);
-		filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping);
-		atomic_dec(&journal->j_async_throttle);
-	}
 }
 
 /*
@@ -3526,9 +3545,8 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
 	 ** will be dealt with by next transaction that actually writes something, but should be taken
 	 ** care of in this trans
 	 */
-	if (journal->j_len == 0) {
-		BUG();
-	}
+	BUG_ON(journal->j_len == 0);
+
 	/* if wcount > 0, and we are called to with flush or commit_now,
 	 ** we wait on j_join_wait.  We will wake up when the last writer has
 	 ** finished the transaction, and started it on its way to the disk.
@@ -3562,9 +3580,8 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
 					unlock_journal(p_s_sb);
 				}
 			}
-			if (journal->j_trans_id == trans_id) {
-				BUG();
-			}
+			BUG_ON(journal->j_trans_id == trans_id);
+			
 			if (commit_now
 			    && journal_list_still_alive(p_s_sb, trans_id)
 			    && wait_on_commit) {
@@ -3844,7 +3861,9 @@ static void flush_old_journal_lists(struct super_block *s)
 		entry = journal->j_journal_list.next;
 		jl = JOURNAL_LIST_ENTRY(entry);
 		/* this check should always be run, to send old lists to disk */
-		if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
+		if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4)) &&
+		    atomic_read(&jl->j_commit_left) == 0 &&
+		    test_transaction(s, jl)) {
 			flush_used_journal_lists(s, jl);
 		} else {
 			break;
@@ -4042,9 +4061,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 	set_commit_trans_len(commit, journal->j_len);
 
 	/* special check in case all buffers in the journal were marked for not logging */
-	if (journal->j_len == 0) {
-		BUG();
-	}
+	BUG_ON(journal->j_len == 0);
 
 	/* we're about to dirty all the log blocks, mark the description block
 	 * dirty now too.  Don't mark the commit block dirty until all the
@@ -4141,8 +4158,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
 				      journal, jl, &jl->j_tail_bh_list);
 		lock_kernel();
 	}
-	if (!list_empty(&jl->j_tail_bh_list))
-		BUG();
+	BUG_ON(!list_empty(&jl->j_tail_bh_list));
 	up(&jl->j_commit_lock);
 
 	/* honor the flush wishes from the caller, simple commits can
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index c61710e49c62..abde1edc2235 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -19,8 +19,8 @@
 #include <linux/smp_lock.h>
 #include <linux/quotaops.h>
 
-#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
-#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--;
+#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; }
+#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
 
 // directory item contains array of entry headers. This performs
 // binary search through that array
@@ -67,8 +67,7 @@ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
 {
 	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
 
-	if (de->de_entry_num >= ih_entry_count(de->de_ih))
-		BUG();
+	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
 
 	de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
 	de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
@@ -80,8 +79,7 @@ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
 // what entry points to
 static inline void set_de_object_key(struct reiserfs_dir_entry *de)
 {
-	if (de->de_entry_num >= ih_entry_count(de->de_ih))
-		BUG();
+	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
 	de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num]));
 	de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num]));
 }
@@ -90,8 +88,7 @@ static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
 {
 	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
 
-	if (de->de_entry_num >= ih_entry_count(de->de_ih))
-		BUG();
+	BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
 
 	/* store key of the found entry */
 	de->de_entry_key.version = KEY_FORMAT_3_5;
@@ -913,7 +910,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
 		reiserfs_warning(inode->i_sb, "%s: empty directory has nlink "
 				 "!= 2 (%d)", __FUNCTION__, inode->i_nlink);
 
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	reiserfs_update_sd(&th, inode);
 
@@ -994,7 +991,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
 		inode->i_nlink = 1;
 	}
 
-	inode->i_nlink--;
+	drop_nlink(inode);
 
 	/*
 	 * we schedule before doing the add_save_link call, save the link
@@ -1006,7 +1003,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
 	    reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL,
 				   0);
 	if (retval < 0) {
-		inode->i_nlink++;
+		inc_nlink(inode);
 		goto end_unlink;
 	}
 	inode->i_ctime = CURRENT_TIME_SEC;
@@ -1143,7 +1140,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 	}
 
 	/* inc before scheduling so reiserfs_unlink knows we are here */
-	inode->i_nlink++;
+	inc_nlink(inode);
 
 	retval = journal_begin(&th, dir->i_sb, jbegin_count);
 	if (retval) {
@@ -1473,9 +1470,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (new_dentry_inode) {
 		// adjust link number of the victim
 		if (S_ISDIR(new_dentry_inode->i_mode)) {
-			new_dentry_inode->i_nlink = 0;
+			clear_nlink(new_dentry_inode);
 		} else {
-			new_dentry_inode->i_nlink--;
+			drop_nlink(new_dentry_inode);
 		}
 		new_dentry_inode->i_ctime = ctime;
 		savelink = new_dentry_inode->i_nlink;
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 39cc7f47f5dc..315684793d1d 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -22,6 +22,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 	int err = 0;
 	struct reiserfs_super_block *sb;
 	struct reiserfs_bitmap_info *bitmap;
+	struct reiserfs_bitmap_info *info;
 	struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
 	struct buffer_head *bh;
 	struct reiserfs_transaction_handle th;
@@ -127,16 +128,20 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 		 * transaction begins, and the new bitmaps don't matter if the
 		 * transaction fails. */
 		for (i = bmap_nr; i < bmap_nr_new; i++) {
-			bitmap[i].bh = sb_getblk(s, i * s->s_blocksize * 8);
-			memset(bitmap[i].bh->b_data, 0, sb_blocksize(sb));
-			reiserfs_test_and_set_le_bit(0, bitmap[i].bh->b_data);
-
-			set_buffer_uptodate(bitmap[i].bh);
-			mark_buffer_dirty(bitmap[i].bh);
-			sync_dirty_buffer(bitmap[i].bh);
+			/* don't use read_bitmap_block since it will cache
+			 * the uninitialized bitmap */
+			bh = sb_bread(s, i * s->s_blocksize * 8);
+			memset(bh->b_data, 0, sb_blocksize(sb));
+			reiserfs_test_and_set_le_bit(0, bh->b_data);
+			reiserfs_cache_bitmap_metadata(s, bh, bitmap + i);
+
+			set_buffer_uptodate(bh);
+			mark_buffer_dirty(bh);
+			sync_dirty_buffer(bh);
 			// update bitmap_info stuff
 			bitmap[i].first_zero_hint = 1;
 			bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
+			brelse(bh);
 		}
 		/* free old bitmap blocks array */
 		SB_AP_BITMAP(s) = bitmap;
@@ -150,30 +155,46 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 	if (err)
 		return err;
 
-	/* correct last bitmap blocks in old and new disk layout */
-	reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr - 1].bh, 1);
+	/* Extend old last bitmap block - new blocks have been made available */
+	info = SB_AP_BITMAP(s) + bmap_nr - 1;
+	bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
+	if (!bh) {
+		int jerr = journal_end(&th, s, 10);
+		if (jerr)
+			return jerr;
+		return -EIO;
+	}
+
+	reiserfs_prepare_for_journal(s, bh, 1);
 	for (i = block_r; i < s->s_blocksize * 8; i++)
-		reiserfs_test_and_clear_le_bit(i,
-					       SB_AP_BITMAP(s)[bmap_nr -
-							       1].bh->b_data);
-	SB_AP_BITMAP(s)[bmap_nr - 1].free_count += s->s_blocksize * 8 - block_r;
-	if (!SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint)
-		SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint = block_r;
+		reiserfs_test_and_clear_le_bit(i, bh->b_data);
+	info->free_count += s->s_blocksize * 8 - block_r;
+	if (!info->first_zero_hint)
+		info->first_zero_hint = block_r;
 
-	journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr - 1].bh);
+	journal_mark_dirty(&th, s, bh);
+	brelse(bh);
+
+	/* Correct new last bitmap block - It may not be full */
+	info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
+	bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
+	if (!bh) {
+		int jerr = journal_end(&th, s, 10);
+		if (jerr)
+			return jerr;
+		return -EIO;
+	}
 
-	reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh, 1);
+	reiserfs_prepare_for_journal(s, bh, 1);
 	for (i = block_r_new; i < s->s_blocksize * 8; i++)
-		reiserfs_test_and_set_le_bit(i,
-					     SB_AP_BITMAP(s)[bmap_nr_new -
-							     1].bh->b_data);
-	journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh);
+		reiserfs_test_and_set_le_bit(i, bh->b_data);
+	journal_mark_dirty(&th, s, bh);
+	brelse(bh);
 
-	SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count -=
-	    s->s_blocksize * 8 - block_r_new;
+	info->free_count -= s->s_blocksize * 8 - block_r_new;
 	/* Extreme case where last bitmap is the only valid block in itself. */
-	if (!SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count)
-		SB_AP_BITMAP(s)[bmap_nr_new - 1].first_zero_hint = 0;
+	if (!info->free_count)
+		info->first_zero_hint = 0;
 	/* update super */
 	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
 	free_blocks = SB_FREE_BLOCKS(s);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 8b9b13127136..5240abe1a709 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1476,9 +1476,7 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
 	int n_block_size = p_s_sb->s_blocksize;
 	int cut_bytes;
 	BUG_ON(!th->t_trans_id);
-
-	if (n_new_file_size != p_s_inode->i_size)
-		BUG();
+	BUG_ON(n_new_file_size != p_s_inode->i_size);
 
 	/* the page being sent in could be NULL if there was an i/o error
 	 ** reading in the last block.  The user will hit problems trying to
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 5567328f1041..9041802df832 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -430,21 +430,29 @@ int remove_save_link(struct inode *inode, int truncate)
 	return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
 }
 
-static void reiserfs_put_super(struct super_block *s)
+static void reiserfs_kill_sb(struct super_block *s)
 {
-	int i;
-	struct reiserfs_transaction_handle th;
-	th.t_trans_id = 0;
+	if (REISERFS_SB(s)) {
+		if (REISERFS_SB(s)->xattr_root) {
+			d_invalidate(REISERFS_SB(s)->xattr_root);
+			dput(REISERFS_SB(s)->xattr_root);
+			REISERFS_SB(s)->xattr_root = NULL;
+		}
 
-	if (REISERFS_SB(s)->xattr_root) {
-		d_invalidate(REISERFS_SB(s)->xattr_root);
-		dput(REISERFS_SB(s)->xattr_root);
+		if (REISERFS_SB(s)->priv_root) {
+			d_invalidate(REISERFS_SB(s)->priv_root);
+			dput(REISERFS_SB(s)->priv_root);
+			REISERFS_SB(s)->priv_root = NULL;
+		}
 	}
 
-	if (REISERFS_SB(s)->priv_root) {
-		d_invalidate(REISERFS_SB(s)->priv_root);
-		dput(REISERFS_SB(s)->priv_root);
-	}
+	kill_block_super(s);
+}
+
+static void reiserfs_put_super(struct super_block *s)
+{
+	struct reiserfs_transaction_handle th;
+	th.t_trans_id = 0;
 
 	/* change file system state to current state if it was mounted with read-write permissions */
 	if (!(s->s_flags & MS_RDONLY)) {
@@ -462,10 +470,7 @@ static void reiserfs_put_super(struct super_block *s)
 	 */
 	journal_release(&th, s);
 
-	for (i = 0; i < SB_BMAP_NR(s); i++)
-		brelse(SB_AP_BITMAP(s)[i].bh);
-
-	vfree(SB_AP_BITMAP(s));
+	reiserfs_free_bitmap_cache(s);
 
 	brelse(SB_BUFFER_WITH_SB(s));
 
@@ -510,8 +515,10 @@ static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
 	    SLAB_CTOR_CONSTRUCTOR) {
 		INIT_LIST_HEAD(&ei->i_prealloc_list);
 		inode_init_once(&ei->vfs_inode);
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
 		ei->i_acl_access = NULL;
 		ei->i_acl_default = NULL;
+#endif
 	}
 }
 
@@ -530,9 +537,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(reiserfs_inode_cachep))
-		reiserfs_warning(NULL,
-				 "reiserfs_inode_cache: not all structures were freed");
+	kmem_cache_destroy(reiserfs_inode_cachep);
 }
 
 /* we don't mark inodes dirty, we just log them */
@@ -562,6 +567,7 @@ static void reiserfs_dirty_inode(struct inode *inode)
 	reiserfs_write_unlock(inode->i_sb);
 }
 
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
 static void reiserfs_clear_inode(struct inode *inode)
 {
 	struct posix_acl *acl;
@@ -576,6 +582,9 @@ static void reiserfs_clear_inode(struct inode *inode)
 		posix_acl_release(acl);
 	REISERFS_I(inode)->i_acl_default = NULL;
 }
+#else
+#define reiserfs_clear_inode NULL
+#endif
 
 #ifdef CONFIG_QUOTA
 static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
@@ -725,12 +734,6 @@ static const arg_desc_t error_actions[] = {
 	{NULL, 0, 0},
 };
 
-int reiserfs_default_io_size = 128 * 1024;	/* Default recommended I/O size is 128k.
-						   There might be broken applications that are
-						   confused by this. Use nolargeio mount option
-						   to get usual i/o size = PAGE_SIZE.
-						 */
-
 /* proceed only one option from a list *cur - string containing of mount options
    opts - array of options which are accepted
    opt_arg - if option is found and requires an argument and if it is specifed
@@ -959,19 +962,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		}
 
 		if (c == 'w') {
-			char *p = NULL;
-			int val = simple_strtoul(arg, &p, 0);
-
-			if (*p != '\0') {
-				reiserfs_warning(s,
-						 "reiserfs_parse_options: non-numeric value %s for nolargeio option",
-						 arg);
-				return 0;
-			}
-			if (val)
-				reiserfs_default_io_size = PAGE_SIZE;
-			else
-				reiserfs_default_io_size = 128 * 1024;
+			reiserfs_warning(s, "reiserfs: nolargeio option is no longer supported");
+			return 0;
 		}
 
 		if (c == 'j') {
@@ -1256,118 +1248,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	return 0;
 }
 
-/* load_bitmap_info_data - Sets up the reiserfs_bitmap_info structure from disk.
- * @sb - superblock for this filesystem
- * @bi - the bitmap info to be loaded. Requires that bi->bh is valid.
- *
- * This routine counts how many free bits there are, finding the first zero
- * as a side effect. Could also be implemented as a loop of test_bit() calls, or
- * a loop of find_first_zero_bit() calls. This implementation is similar to
- * find_first_zero_bit(), but doesn't return after it finds the first bit.
- * Should only be called on fs mount, but should be fairly efficient anyways.
- *
- * bi->first_zero_hint is considered unset if it == 0, since the bitmap itself
- * will * invariably occupt block 0 represented in the bitmap. The only
- * exception to this is when free_count also == 0, since there will be no
- * free blocks at all.
- */
-
-static void load_bitmap_info_data(struct super_block *sb,
-				  struct reiserfs_bitmap_info *bi)
-{
-	unsigned long *cur = (unsigned long *)bi->bh->b_data;
-
-	while ((char *)cur < (bi->bh->b_data + sb->s_blocksize)) {
-
-		/* No need to scan if all 0's or all 1's.
-		 * Since we're only counting 0's, we can simply ignore all 1's */
-		if (*cur == 0) {
-			if (bi->first_zero_hint == 0) {
-				bi->first_zero_hint =
-				    ((char *)cur - bi->bh->b_data) << 3;
-			}
-			bi->free_count += sizeof(unsigned long) * 8;
-		} else if (*cur != ~0L) {
-			int b;
-			for (b = 0; b < sizeof(unsigned long) * 8; b++) {
-				if (!reiserfs_test_le_bit(b, cur)) {
-					bi->free_count++;
-					if (bi->first_zero_hint == 0)
-						bi->first_zero_hint =
-						    (((char *)cur -
-						      bi->bh->b_data) << 3) + b;
-				}
-			}
-		}
-		cur++;
-	}
-
-#ifdef CONFIG_REISERFS_CHECK
-// This outputs a lot of unneded info on big FSes
-//    reiserfs_warning ("bitmap loaded from block %d: %d free blocks",
-//                    bi->bh->b_blocknr, bi->free_count);
-#endif
-}
-
-static int read_bitmaps(struct super_block *s)
-{
-	int i, bmap_nr;
-
-	SB_AP_BITMAP(s) =
-	    vmalloc(sizeof(struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
-	if (SB_AP_BITMAP(s) == 0)
-		return 1;
-	memset(SB_AP_BITMAP(s), 0,
-	       sizeof(struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
-	for (i = 0, bmap_nr =
-	     REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize + 1;
-	     i < SB_BMAP_NR(s); i++, bmap_nr = s->s_blocksize * 8 * i) {
-		SB_AP_BITMAP(s)[i].bh = sb_getblk(s, bmap_nr);
-		if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh))
-			ll_rw_block(READ, 1, &SB_AP_BITMAP(s)[i].bh);
-	}
-	for (i = 0; i < SB_BMAP_NR(s); i++) {
-		wait_on_buffer(SB_AP_BITMAP(s)[i].bh);
-		if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) {
-			reiserfs_warning(s, "sh-2029: reiserfs read_bitmaps: "
-					 "bitmap block (#%lu) reading failed",
-					 SB_AP_BITMAP(s)[i].bh->b_blocknr);
-			for (i = 0; i < SB_BMAP_NR(s); i++)
-				brelse(SB_AP_BITMAP(s)[i].bh);
-			vfree(SB_AP_BITMAP(s));
-			SB_AP_BITMAP(s) = NULL;
-			return 1;
-		}
-		load_bitmap_info_data(s, SB_AP_BITMAP(s) + i);
-	}
-	return 0;
-}
-
-static int read_old_bitmaps(struct super_block *s)
-{
-	int i;
-	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
-	int bmp1 = (REISERFS_OLD_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1;	/* first of bitmap blocks */
-
-	/* read true bitmap */
-	SB_AP_BITMAP(s) =
-	    vmalloc(sizeof(struct reiserfs_buffer_info *) * sb_bmap_nr(rs));
-	if (SB_AP_BITMAP(s) == 0)
-		return 1;
-
-	memset(SB_AP_BITMAP(s), 0,
-	       sizeof(struct reiserfs_buffer_info *) * sb_bmap_nr(rs));
-
-	for (i = 0; i < sb_bmap_nr(rs); i++) {
-		SB_AP_BITMAP(s)[i].bh = sb_bread(s, bmp1 + i);
-		if (!SB_AP_BITMAP(s)[i].bh)
-			return 1;
-		load_bitmap_info_data(s, SB_AP_BITMAP(s) + i);
-	}
-
-	return 0;
-}
-
 static int read_super_block(struct super_block *s, int offset)
 {
 	struct buffer_head *bh;
@@ -1469,7 +1349,6 @@ static int read_super_block(struct super_block *s, int offset)
 /* after journal replay, reread all bitmap and super blocks */
 static int reread_meta_blocks(struct super_block *s)
 {
-	int i;
 	ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
 	wait_on_buffer(SB_BUFFER_WITH_SB(s));
 	if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
@@ -1478,20 +1357,7 @@ static int reread_meta_blocks(struct super_block *s)
 		return 1;
 	}
 
-	for (i = 0; i < SB_BMAP_NR(s); i++) {
-		ll_rw_block(READ, 1, &(SB_AP_BITMAP(s)[i].bh));
-		wait_on_buffer(SB_AP_BITMAP(s)[i].bh);
-		if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) {
-			reiserfs_warning(s,
-					 "reread_meta_blocks, error reading bitmap block number %d at %llu",
-					 i,
-					 (unsigned long long)SB_AP_BITMAP(s)[i].
-					 bh->b_blocknr);
-			return 1;
-		}
-	}
 	return 0;
-
 }
 
 /////////////////////////////////////////////////////
@@ -1672,7 +1538,6 @@ static int function2code(hashf_t func)
 static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 {
 	struct inode *root_inode;
-	int j;
 	struct reiserfs_transaction_handle th;
 	int old_format = 0;
 	unsigned long blocks;
@@ -1749,7 +1614,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	sbi->s_mount_state = SB_REISERFS_STATE(s);
 	sbi->s_mount_state = REISERFS_VALID_FS;
 
-	if (old_format ? read_old_bitmaps(s) : read_bitmaps(s)) {
+	if ((errval = reiserfs_init_bitmap_cache(s))) {
 		SWARN(silent, s,
 		      "jmacd-8: reiserfs_fill_super: unable to read bitmap");
 		goto error;
@@ -1831,6 +1696,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	if (is_reiserfs_3_5(rs)
 	    || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
 		set_bit(REISERFS_3_5, &(sbi->s_properties));
+	else if (old_format)
+		set_bit(REISERFS_OLD_FORMAT, &(sbi->s_properties));
 	else
 		set_bit(REISERFS_3_6, &(sbi->s_properties));
 
@@ -1916,19 +1783,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	if (jinit_done) {	/* kill the commit thread, free journal ram */
 		journal_release_error(NULL, s);
 	}
-	if (SB_DISK_SUPER_BLOCK(s)) {
-		for (j = 0; j < SB_BMAP_NR(s); j++) {
-			if (SB_AP_BITMAP(s))
-				brelse(SB_AP_BITMAP(s)[j].bh);
-		}
-		vfree(SB_AP_BITMAP(s));
-	}
+
+	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
 		brelse(SB_BUFFER_WITH_SB(s));
 #ifdef CONFIG_QUOTA
-	for (j = 0; j < MAXQUOTAS; j++) {
-		kfree(sbi->s_qf_names[j]);
-		sbi->s_qf_names[j] = NULL;
+	{
+		int j;
+		for (j = 0; j < MAXQUOTAS; j++) {
+			kfree(sbi->s_qf_names[j]);
+			sbi->s_qf_names[j] = NULL;
+		}
 	}
 #endif
 	kfree(sbi);
@@ -2300,7 +2165,7 @@ struct file_system_type reiserfs_fs_type = {
 	.owner = THIS_MODULE,
 	.name = "reiserfs",
 	.get_sb = get_super_block,
-	.kill_sb = kill_block_super,
+	.kill_sb = reiserfs_kill_sb,
 	.fs_flags = FS_REQUIRES_DEV,
 };
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index d935fb9394e3..7bdb0ed443e1 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -773,7 +773,7 @@ int reiserfs_xattr_del(struct inode *inode, const char *name)
 
 static int
 reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen,
-			      loff_t offset, ino_t ino, unsigned int d_type)
+			      loff_t offset, u64 ino, unsigned int d_type)
 {
 	struct dentry *xadir = (struct dentry *)buf;
 
@@ -851,7 +851,7 @@ struct reiserfs_chown_buf {
 /* XXX: If there is a better way to do this, I'd love to hear about it */
 static int
 reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen,
-			     loff_t offset, ino_t ino, unsigned int d_type)
+			     loff_t offset, u64 ino, unsigned int d_type)
 {
 	struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf;
 	struct dentry *xafile, *xadir = chown_buf->xadir;
@@ -1036,7 +1036,7 @@ struct reiserfs_listxattr_buf {
 
 static int
 reiserfs_listxattr_filler(void *buf, const char *name, int namelen,
-			  loff_t offset, ino_t ino, unsigned int d_type)
+			  loff_t offset, u64 ino, unsigned int d_type)
 {
 	struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf;
 	int len = 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 22eed61ebf69..ddcd9e1ef282 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -589,8 +589,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(romfs_inode_cachep))
-		printk(KERN_INFO "romfs_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(romfs_inode_cachep);
 }
 
 static int romfs_remount(struct super_block *sb, int *flags, char *data)
diff --git a/fs/select.c b/fs/select.c
index 33b72ba0f86f..dcbc1112b7ec 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -658,8 +658,6 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
  	unsigned int i;
 	struct poll_list *head;
  	struct poll_list *walk;
-	struct fdtable *fdt;
-	int max_fdset;
 	/* Allocate small arguments on the stack to save memory and be
 	   faster - use long to make sure the buffer is aligned properly
 	   on 64 bit archs to avoid unaligned access */
@@ -667,11 +665,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
 	struct poll_list *stack_pp = NULL;
 
 	/* Do a sanity check on nfds ... */
-	rcu_read_lock();
-	fdt = files_fdtable(current->files);
-	max_fdset = fdt->max_fdset;
-	rcu_read_unlock();
-	if (nfds > max_fdset && nfds > OPEN_MAX)
+	if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
 		return -EINVAL;
 
 	poll_initwait(&table);
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dae67048baba..50784d13c87b 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -214,13 +214,15 @@ smb_updatepage(struct file *file, struct page *page, unsigned long offset,
 }
 
 static ssize_t
-smb_file_read(struct file * file, char __user * buf, size_t count, loff_t *ppos)
+smb_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			unsigned long nr_segs, loff_t pos)
 {
+	struct file * file = iocb->ki_filp;
 	struct dentry * dentry = file->f_dentry;
 	ssize_t	status;
 
 	VERBOSE("file %s/%s, count=%lu@%lu\n", DENTRY_PATH(dentry),
-		(unsigned long) count, (unsigned long) *ppos);
+		(unsigned long) iocb->ki_left, (unsigned long) pos);
 
 	status = smb_revalidate_inode(dentry);
 	if (status) {
@@ -233,7 +235,7 @@ smb_file_read(struct file * file, char __user * buf, size_t count, loff_t *ppos)
 		(long)dentry->d_inode->i_size,
 		dentry->d_inode->i_flags, dentry->d_inode->i_atime);
 
-	status = generic_file_read(file, buf, count, ppos);
+	status = generic_file_aio_read(iocb, iov, nr_segs, pos);
 out:
 	return status;
 }
@@ -317,14 +319,16 @@ const struct address_space_operations smb_file_aops = {
  * Write to a file (through the page cache).
  */
 static ssize_t
-smb_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
+smb_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t pos)
 {
+	struct file * file = iocb->ki_filp;
 	struct dentry * dentry = file->f_dentry;
 	ssize_t	result;
 
 	VERBOSE("file %s/%s, count=%lu@%lu\n",
 		DENTRY_PATH(dentry),
-		(unsigned long) count, (unsigned long) *ppos);
+		(unsigned long) iocb->ki_left, (unsigned long) pos);
 
 	result = smb_revalidate_inode(dentry);
 	if (result) {
@@ -337,8 +341,8 @@ smb_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 	if (result)
 		goto out;
 
-	if (count > 0) {
-		result = generic_file_write(file, buf, count, ppos);
+	if (iocb->ki_left > 0) {
+		result = generic_file_aio_write(iocb, iov, nr_segs, pos);
 		VERBOSE("pos=%ld, size=%ld, mtime=%ld, atime=%ld\n",
 			(long) file->f_pos, (long) dentry->d_inode->i_size,
 			dentry->d_inode->i_mtime, dentry->d_inode->i_atime);
@@ -402,8 +406,10 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
 const struct file_operations smb_file_operations =
 {
 	.llseek		= remote_llseek,
-	.read		= smb_file_read,
-	.write		= smb_file_write,
+	.read		= do_sync_read,
+	.aio_read	= smb_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= smb_file_aio_write,
 	.ioctl		= smb_ioctl,
 	.mmap		= smb_file_mmap,
 	.open		= smb_file_open,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index a1ed657c3c84..2c122ee83adb 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -89,8 +89,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(smb_inode_cachep))
-		printk(KERN_INFO "smb_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(smb_inode_cachep);
 }
 
 static int smb_remount(struct super_block *sb, int *flags, char *data)
@@ -167,7 +166,6 @@ smb_get_inode_attr(struct inode *inode, struct smb_fattr *fattr)
 	fattr->f_mtime	= inode->i_mtime;
 	fattr->f_ctime	= inode->i_ctime;
 	fattr->f_atime	= inode->i_atime;
-	fattr->f_blksize= inode->i_blksize;
 	fattr->f_blocks	= inode->i_blocks;
 
 	fattr->attr	= SMB_I(inode)->attr;
@@ -201,7 +199,6 @@ smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
 	inode->i_uid	= fattr->f_uid;
 	inode->i_gid	= fattr->f_gid;
 	inode->i_ctime	= fattr->f_ctime;
-	inode->i_blksize= fattr->f_blksize;
 	inode->i_blocks = fattr->f_blocks;
 	inode->i_size	= fattr->f_size;
 	inode->i_mtime	= fattr->f_mtime;
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index c3495059889d..40e174db9872 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -1826,7 +1826,6 @@ smb_init_dirent(struct smb_sb_info *server, struct smb_fattr *fattr)
 	fattr->f_nlink = 1;
 	fattr->f_uid = server->mnt->uid;
 	fattr->f_gid = server->mnt->gid;
-	fattr->f_blksize = SMB_ST_BLKSIZE;
 	fattr->f_unix = 0;
 }
 
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index c8e96195b96e..0fb74697abc4 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -49,8 +49,7 @@ int smb_init_request_cache(void)
 
 void smb_destroy_request_cache(void)
 {
-	if (kmem_cache_destroy(req_cachep))
-		printk(KERN_INFO "smb_destroy_request_cache: not all structures were freed\n");
+	kmem_cache_destroy(req_cachep);
 }
 
 /*
diff --git a/fs/splice.c b/fs/splice.c
index 684bca3d3a10..49fb9f129938 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -12,7 +12,7 @@
  * Jens to support splicing to files, network, direct splicing, etc and
  * fixing lots of bugs.
  *
- * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
  *
@@ -607,7 +607,7 @@ find_page:
 			ret = -ENOMEM;
 			page = page_cache_alloc_cold(mapping);
 			if (unlikely(!page))
-				goto out_nomem;
+				goto out_ret;
 
 			/*
 			 * This will also lock the page
@@ -666,7 +666,7 @@ find_page:
 		if (sd->pos + this_len > isize)
 			vmtruncate(mapping->host, isize);
 
-		goto out;
+		goto out_ret;
 	}
 
 	if (buf->page != page) {
@@ -698,7 +698,7 @@ find_page:
 out:
 	page_cache_release(page);
 	unlock_page(page);
-out_nomem:
+out_ret:
 	return ret;
 }
 
@@ -707,9 +707,9 @@ out_nomem:
  * key here is the 'actor' worker passed in that actually moves the data
  * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
  */
-ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
-			 loff_t *ppos, size_t len, unsigned int flags,
-			 splice_actor *actor)
+static ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
+				  struct file *out, loff_t *ppos, size_t len,
+				  unsigned int flags, splice_actor *actor)
 {
 	int ret, do_wakeup, err;
 	struct splice_desc sd;
@@ -722,9 +722,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 	sd.file = out;
 	sd.pos = *ppos;
 
-	if (pipe->inode)
-		mutex_lock(&pipe->inode->i_mutex);
-
 	for (;;) {
 		if (pipe->nrbufs) {
 			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
@@ -797,9 +794,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 		pipe_wait(pipe);
 	}
 
-	if (pipe->inode)
-		mutex_unlock(&pipe->inode->i_mutex);
-
 	if (do_wakeup) {
 		smp_mb();
 		if (waitqueue_active(&pipe->wait))
@@ -810,6 +804,73 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 	return ret;
 }
 
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags,
+			 splice_actor *actor)
+{
+	ssize_t ret;
+	struct inode *inode = out->f_mapping->host;
+
+	/*
+	 * The actor worker might be calling ->prepare_write and
+	 * ->commit_write. Most of the time, these expect i_mutex to
+	 * be held. Since this may result in an ABBA deadlock with
+	 * pipe->inode, we have to order lock acquiry here.
+	 */
+	inode_double_lock(inode, pipe->inode);
+	ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor);
+	inode_double_unlock(inode, pipe->inode);
+
+	return ret;
+}
+
+/**
+ * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
+ * @pipe:	pipe info
+ * @out:	file to write to
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Will either move or copy pages (determined by @flags options) from
+ * the given pipe inode to the given file. The caller is responsible
+ * for acquiring i_mutex on both inodes.
+ *
+ */
+ssize_t
+generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
+				 loff_t *ppos, size_t len, unsigned int flags)
+{
+	struct address_space *mapping = out->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	int err;
+
+	err = remove_suid(out->f_dentry);
+	if (unlikely(err))
+		return err;
+
+	ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
+	if (ret > 0) {
+		*ppos += ret;
+
+		/*
+		 * If file or inode is SYNC and we actually wrote some data,
+		 * sync it.
+		 */
+		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			err = generic_osync_inode(inode, mapping,
+						  OSYNC_METADATA|OSYNC_DATA);
+
+			if (err)
+				ret = err;
+		}
+	}
+
+	return ret;
+}
+
+EXPORT_SYMBOL(generic_file_splice_write_nolock);
+
 /**
  * generic_file_splice_write - splice data from a pipe to a file
  * @pipe:	pipe info
@@ -826,12 +887,21 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 			  loff_t *ppos, size_t len, unsigned int flags)
 {
 	struct address_space *mapping = out->f_mapping;
+	struct inode *inode = mapping->host;
 	ssize_t ret;
+	int err;
+
+	err = should_remove_suid(out->f_dentry);
+	if (unlikely(err)) {
+		mutex_lock(&inode->i_mutex);
+		err = __remove_suid(out->f_dentry, err);
+		mutex_unlock(&inode->i_mutex);
+		if (err)
+			return err;
+	}
 
 	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
 	if (ret > 0) {
-		struct inode *inode = mapping->host;
-
 		*ppos += ret;
 
 		/*
@@ -839,8 +909,6 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 		 * sync it.
 		 */
 		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
-			int err;
-
 			mutex_lock(&inode->i_mutex);
 			err = generic_osync_inode(inode, mapping,
 						  OSYNC_METADATA|OSYNC_DATA);
@@ -1400,13 +1468,7 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 	 * grabbing by inode address. Otherwise two different processes
 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
 	 */
-	if (ipipe->inode < opipe->inode) {
-		mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_CHILD);
-	} else {
-		mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_CHILD);
-	}
+	inode_double_lock(ipipe->inode, opipe->inode);
 
 	do {
 		if (!opipe->readers) {
@@ -1450,8 +1512,7 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		i++;
 	} while (len);
 
-	mutex_unlock(&ipipe->inode->i_mutex);
-	mutex_unlock(&opipe->inode->i_mutex);
+	inode_double_unlock(ipipe->inode, opipe->inode);
 
 	/*
 	 * If we put data in the output pipe, wakeup any potential readers.
diff --git a/fs/stat.c b/fs/stat.c
index 3a44dcf97da2..bca07eb2003c 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -14,6 +14,7 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/pagemap.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -32,7 +33,7 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
 	stat->ctime = inode->i_ctime;
 	stat->size = i_size_read(inode);
 	stat->blocks = inode->i_blocks;
-	stat->blksize = inode->i_blksize;
+	stat->blksize = (1 << inode->i_blkbits);
 }
 
 EXPORT_SYMBOL(generic_fillattr);
@@ -139,6 +140,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 	memset(&tmp, 0, sizeof(struct __old_kernel_stat));
 	tmp.st_dev = old_encode_dev(stat->dev);
 	tmp.st_ino = stat->ino;
+	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+		return -EOVERFLOW;
 	tmp.st_mode = stat->mode;
 	tmp.st_nlink = stat->nlink;
 	if (tmp.st_nlink != stat->nlink)
@@ -209,6 +212,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	tmp.st_dev = new_encode_dev(stat->dev);
 #endif
 	tmp.st_ino = stat->ino;
+	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+		return -EOVERFLOW;
 	tmp.st_mode = stat->mode;
 	tmp.st_nlink = stat->nlink;
 	if (tmp.st_nlink != stat->nlink)
@@ -346,6 +351,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 	tmp.st_rdev = huge_encode_dev(stat->rdev);
 #endif
 	tmp.st_ino = stat->ino;
+	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+		return -EOVERFLOW;
 #ifdef STAT64_HAS_BROKEN_ST_INO
 	tmp.__st_ino = stat->ino;
 #endif
diff --git a/fs/super.c b/fs/super.c
index 5c4c94d5495e..47e554c12e76 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -199,7 +199,7 @@ EXPORT_SYMBOL(deactivate_super);
  *	success, 0 if we had failed (superblock contents was already dead or
  *	dying when grab_super() had been called).
  */
-static int grab_super(struct super_block *s)
+static int grab_super(struct super_block *s) __releases(sb_lock)
 {
 	s->s_count++;
 	spin_unlock(&sb_lock);
@@ -220,6 +220,37 @@ static int grab_super(struct super_block *s)
 	return 0;
 }
 
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock.  Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.  Requires a second blkdev
+ * flush by the caller to complete the operation.
+ */
+void __fsync_super(struct super_block *sb)
+{
+	sync_inodes_sb(sb, 0);
+	DQUOT_SYNC(sb);
+	lock_super(sb);
+	if (sb->s_dirt && sb->s_op->write_super)
+		sb->s_op->write_super(sb);
+	unlock_super(sb);
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, 1);
+	sync_blockdev(sb->s_bdev);
+	sync_inodes_sb(sb, 1);
+}
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock.  Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
+int fsync_super(struct super_block *sb)
+{
+	__fsync_super(sb);
+	return sync_blockdev(sb->s_bdev);
+}
+
 /**
  *	generic_shutdown_super	-	common helper for ->kill_sb()
  *	@sb: superblock to kill
@@ -229,17 +260,17 @@ static int grab_super(struct super_block *s)
  *	that need destruction out of superblock, call generic_shutdown_super()
  *	and release aforementioned objects.  Note: dentries and inodes _are_
  *	taken care of and do not need specific handling.
+ *
+ *	Upon calling this function, the filesystem may no longer alter or
+ *	rearrange the set of dentries belonging to this super_block, nor may it
+ *	change the attachments of dentries to inodes.
  */
 void generic_shutdown_super(struct super_block *sb)
 {
-	struct dentry *root = sb->s_root;
 	struct super_operations *sop = sb->s_op;
 
-	if (root) {
-		sb->s_root = NULL;
-		shrink_dcache_parent(root);
-		shrink_dcache_sb(sb);
-		dput(root);
+	if (sb->s_root) {
+		shrink_dcache_for_umount(sb);
 		fsync_super(sb);
 		lock_super(sb);
 		sb->s_flags &= ~MS_ACTIVE;
@@ -540,8 +571,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
 	int retval;
 	
+#ifdef CONFIG_BLOCK
 	if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev))
 		return -EACCES;
+#endif
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
@@ -661,6 +694,7 @@ void kill_litter_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_litter_super);
 
+#ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
@@ -756,6 +790,7 @@ void kill_block_super(struct super_block *sb)
 }
 
 EXPORT_SYMBOL(kill_block_super);
+#endif
 
 int get_sb_nodev(struct file_system_type *fs_type,
 	int flags, void *data,
diff --git a/fs/sync.c b/fs/sync.c
index 955aef04da28..1de747b5ddb9 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -10,11 +10,124 @@
 #include <linux/syscalls.h>
 #include <linux/linkage.h>
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
 /*
+ * sync everything.  Start out by waking pdflush, because that writes back
+ * all queues in parallel.
+ */
+static void do_sync(unsigned long wait)
+{
+	wakeup_pdflush(0);
+	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
+	DQUOT_SYNC(NULL);
+	sync_supers();		/* Write the superblocks */
+	sync_filesystems(0);	/* Start syncing the filesystems */
+	sync_filesystems(wait);	/* Waitingly sync the filesystems */
+	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
+	if (!wait)
+		printk("Emergency Sync complete\n");
+	if (unlikely(laptop_mode))
+		laptop_sync_completion();
+}
+
+asmlinkage long sys_sync(void)
+{
+	do_sync(1);
+	return 0;
+}
+
+void emergency_sync(void)
+{
+	pdflush_operation(do_sync, 0);
+}
+
+/*
+ * Generic function to fsync a file.
+ *
+ * filp may be NULL if called via the msync of a vma.
+ */
+int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+	struct inode * inode = dentry->d_inode;
+	struct super_block * sb;
+	int ret, err;
+
+	/* sync the inode to buffers */
+	ret = write_inode_now(inode, 0);
+
+	/* sync the superblock to buffers */
+	sb = inode->i_sb;
+	lock_super(sb);
+	if (sb->s_op->write_super)
+		sb->s_op->write_super(sb);
+	unlock_super(sb);
+
+	/* .. finally sync the buffers to disk */
+	err = sync_blockdev(sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
+
+long do_fsync(struct file *file, int datasync)
+{
+	int ret;
+	int err;
+	struct address_space *mapping = file->f_mapping;
+
+	if (!file->f_op || !file->f_op->fsync) {
+		/* Why?  We can still call filemap_fdatawrite */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = filemap_fdatawrite(mapping);
+
+	/*
+	 * We need to protect against concurrent writers, which could cause
+	 * livelocks in fsync_buffers_list().
+	 */
+	mutex_lock(&mapping->host->i_mutex);
+	err = file->f_op->fsync(file, file->f_dentry, datasync);
+	if (!ret)
+		ret = err;
+	mutex_unlock(&mapping->host->i_mutex);
+	err = filemap_fdatawait(mapping);
+	if (!ret)
+		ret = err;
+out:
+	return ret;
+}
+
+static long __do_fsync(unsigned int fd, int datasync)
+{
+	struct file *file;
+	int ret = -EBADF;
+
+	file = fget(fd);
+	if (file) {
+		ret = do_fsync(file, datasync);
+		fput(file);
+	}
+	return ret;
+}
+
+asmlinkage long sys_fsync(unsigned int fd)
+{
+	return __do_fsync(fd, 0);
+}
+
+asmlinkage long sys_fdatasync(unsigned int fd)
+{
+	return __do_fsync(fd, 1);
+}
+
+/*
  * sys_sync_file_range() permits finely controlled syncing over a segment of
  * a file in the range offset .. (offset+nbytes-1) inclusive.  If nbytes is
  * zero then sys_sync_file_range() will operate from offset out to EOF.
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5f3d725d1125..3aa3434621ca 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -103,7 +103,7 @@ static int init_dir(struct inode * inode)
 	inode->i_fop = &sysfs_dir_operations;
 
 	/* directory inodes start off with i_nlink == 2 (for "." entry) */
-	inode->i_nlink++;
+	inc_nlink(inode);
 	return 0;
 }
 
@@ -137,7 +137,7 @@ static int create_dir(struct kobject * k, struct dentry * p,
 		if (!error) {
 			error = sysfs_create(*d, mode, init_dir);
 			if (!error) {
-				p->d_inode->i_nlink++;
+				inc_nlink(p->d_inode);
 				(*d)->d_op = &sysfs_dentry_ops;
 				d_rehash(*d);
 			}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index cf3786625bfa..298303b5a716 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -157,8 +157,8 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 		if ((retval = fill_read_buffer(file->f_dentry,buffer)))
 			goto out;
 	}
-	pr_debug("%s: count = %d, ppos = %lld, buf = %s\n",
-		 __FUNCTION__,count,*ppos,buffer->page);
+	pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
+		 __FUNCTION__, count, *ppos, buffer->page);
 	retval = flush_read_buffer(buffer,buf,count,ppos);
 out:
 	up(&buffer->sem);
@@ -483,17 +483,12 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
 		    (victim->d_parent->d_inode == dir->d_inode)) {
 			victim->d_inode->i_mtime = CURRENT_TIME;
 			fsnotify_modify(victim);
-
-			/**
-			 * Drop reference from initial sysfs_get_dentry().
-			 */
-			dput(victim);
 			res = 0;
 		} else
 			d_drop(victim);
 		
 		/**
-		 * Drop the reference acquired from sysfs_get_dentry() above.
+		 * Drop the reference acquired from lookup_one_len() above.
 		 */
 		dput(victim);
 	}
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index fd7cd5f843d2..e79e38d52c00 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -125,7 +125,6 @@ struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd)
 {
 	struct inode * inode = new_inode(sysfs_sb);
 	if (inode) {
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &sysfs_aops;
 		inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 40190c489271..20551a1b8a56 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -49,7 +49,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 		inode->i_op = &sysfs_dir_inode_operations;
 		inode->i_fop = &sysfs_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
-		inode->i_nlink++;	
+		inc_nlink(inode);
 	} else {
 		pr_debug("sysfs: could not get root inode\n");
 		return -ENOMEM;
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index a59e303135fa..47a4b728f15b 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -21,8 +21,10 @@
  */
 const struct file_operations sysv_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.fsync		= sysv_sync_file,
 	.sendfile	= generic_file_sendfile,
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 9b585d1081c0..115ab0d6f4bc 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -170,7 +170,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
 	inode->i_uid = current->fsuid;
 	inode->i_ino = fs16_to_cpu(sbi, ino);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 	memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
 	SYSV_I(inode)->i_dir_start_lookup = 0;
 	insert_inode_hash(inode);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 58b2d22142ba..d63c5e48b050 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -201,7 +201,7 @@ static void sysv_read_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = 0;
 	inode->i_atime.tv_nsec = 0;
 	inode->i_mtime.tv_nsec = 0;
-	inode->i_blocks = inode->i_blksize = 0;
+	inode->i_blocks = 0;
 
 	si = SYSV_I(inode);
 	for (block = 0; block < 10+1+1+1; block++)
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b8a73f716fbe..f7c08db8e34c 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -250,7 +250,7 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 		sysv_set_link(new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		inode_dec_link_count(new_inode);
 	} else {
 		if (dir_de) {
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 876639b93321..dc9e7dc07fb7 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -358,21 +358,15 @@ static int sysv_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long blocknr;
 	int size = 0, i;
 	
-	if (1024 != sizeof (struct xenix_super_block))
-		panic("Xenix FS: bad superblock size");
-	if (512 != sizeof (struct sysv4_super_block))
-		panic("SystemV FS: bad superblock size");
-	if (512 != sizeof (struct sysv2_super_block))
-		panic("SystemV FS: bad superblock size");
-	if (500 != sizeof (struct coh_super_block))
-		panic("Coherent FS: bad superblock size");
-	if (64 != sizeof (struct sysv_inode))
-		panic("sysv fs: bad inode size");
+	BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block));
+	BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block));
+	BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block));
+	BUILD_BUG_ON(500 != sizeof (struct coh_super_block));
+	BUILD_BUG_ON(64 != sizeof (struct sysv_inode));
 
-	sbi = kmalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-	memset(sbi, 0, sizeof(struct sysv_sb_info));
 
 	sbi->s_sb = sb;
 	sbi->s_block_base = 0;
@@ -453,10 +447,9 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
 	if (64 != sizeof (struct sysv_inode))
 		panic("sysv fs: bad i-node size");
 
-	sbi = kmalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-	memset(sbi, 0, sizeof(struct sysv_sb_info));
 
 	sbi->s_sb = sb;
 	sbi->s_block_base = 0;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index a59e5f33daf6..7aedd552cba1 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -103,19 +103,21 @@ const struct address_space_operations udf_adinicb_aops = {
 	.commit_write		= udf_adinicb_commit_write,
 };
 
-static ssize_t udf_file_write(struct file * file, const char __user * buf,
-	size_t count, loff_t *ppos)
+static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t ppos)
 {
 	ssize_t retval;
+	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_dentry->d_inode;
 	int err, pos;
+	size_t count = iocb->ki_left;
 
 	if (UDF_I_ALLOCTYPE(inode) == ICBTAG_FLAG_AD_IN_ICB)
 	{
 		if (file->f_flags & O_APPEND)
 			pos = inode->i_size;
 		else
-			pos = *ppos;
+			pos = ppos;
 
 		if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) +
 			pos + count))
@@ -136,7 +138,7 @@ static ssize_t udf_file_write(struct file * file, const char __user * buf,
 		}
 	}
 
-	retval = generic_file_write(file, buf, count, ppos);
+	retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
 
 	if (retval > 0)
 		mark_inode_dirty(inode);
@@ -249,11 +251,13 @@ static int udf_release_file(struct inode * inode, struct file * filp)
 }
 
 const struct file_operations udf_file_operations = {
-	.read			= generic_file_read,
+	.read			= do_sync_read,
+	.aio_read		= generic_file_aio_read,
 	.ioctl			= udf_ioctl,
 	.open			= generic_file_open,
 	.mmap			= generic_file_mmap,
-	.write			= udf_file_write,
+	.write			= do_sync_write,
+	.aio_write		= udf_file_aio_write,
 	.release		= udf_release_file,
 	.fsync			= udf_fsync_file,
 	.sendfile		= generic_file_sendfile,
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 33323473e3c4..8206983f2ebf 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -121,7 +121,6 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
 	UDF_I_LOCATION(inode).logicalBlockNum = block;
 	UDF_I_LOCATION(inode).partitionReferenceNum = UDF_I_LOCATION(dir).partitionReferenceNum;
 	inode->i_ino = udf_get_lb_pblock(sb, UDF_I_LOCATION(inode), 0);
-	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = 0;
 	UDF_I_LENEATTR(inode) = 0;
 	UDF_I_LENALLOC(inode) = 0;
@@ -130,14 +129,12 @@ struct inode * udf_new_inode (struct inode *dir, int mode, int * err)
 	{
 		UDF_I_EFE(inode) = 1;
 		UDF_UPDATE_UDFREV(inode->i_sb, UDF_VERS_USE_EXTENDED_FE);
-		UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL);
-		memset(UDF_I_DATA(inode), 0x00, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry));
+		UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry), GFP_KERNEL);
 	}
 	else
 	{
 		UDF_I_EFE(inode) = 0;
-		UDF_I_DATA(inode) = kmalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL);
-		memset(UDF_I_DATA(inode), 0x00, inode->i_sb->s_blocksize - sizeof(struct fileEntry));
+		UDF_I_DATA(inode) = kzalloc(inode->i_sb->s_blocksize - sizeof(struct fileEntry), GFP_KERNEL);
 	}
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
 		UDF_I_ALLOCTYPE(inode) = ICBTAG_FLAG_AD_IN_ICB;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 605f5111b6d8..ae21a0e59e95 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -916,8 +916,6 @@ __udf_read_inode(struct inode *inode)
 	 *      i_nlink = 1
 	 *      i_op = NULL;
 	 */
-	inode->i_blksize = PAGE_SIZE;
-
 	bh = udf_read_ptagged(inode->i_sb, UDF_I_LOCATION(inode), 0, &ident);
 
 	if (!bh)
@@ -1167,7 +1165,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 			inode->i_op = &udf_dir_inode_operations;
 			inode->i_fop = &udf_dir_operations;
 			inode->i_mode |= S_IFDIR;
-			inode->i_nlink ++;
+			inc_nlink(inode);
 			break;
 		}
 		case ICBTAG_FILE_TYPE_REALTIME:
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index ab9a7629d23e..73163325e5ec 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -762,7 +762,7 @@ static int udf_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 		cpu_to_le32(UDF_I_UNIQUE(inode) & 0x00000000FFFFFFFFUL);
 	cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY;
 	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
-	dir->i_nlink++;
+	inc_nlink(dir);
 	mark_inode_dirty(dir);
 	d_instantiate(dentry, inode);
 	if (fibh.sbh != fibh.ebh)
@@ -876,10 +876,9 @@ static int udf_rmdir(struct inode * dir, struct dentry * dentry)
 		udf_warning(inode->i_sb, "udf_rmdir",
 			"empty directory has nlink != 2 (%d)",
 			inode->i_nlink);
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	inode->i_size = 0;
-	mark_inode_dirty(inode);
-	dir->i_nlink --;
+	inode_dec_link_count(inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
 	mark_inode_dirty(dir);
 
@@ -923,8 +922,7 @@ static int udf_unlink(struct inode * dir, struct dentry * dentry)
 		goto end_unlink;
 	dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb);
 	mark_inode_dirty(dir);
-	inode->i_nlink--;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 	inode->i_ctime = dir->i_ctime;
 	retval = 0;
 
@@ -1101,8 +1099,7 @@ out:
 	return err;
 
 out_no_entry:
-	inode->i_nlink--;
-	mark_inode_dirty(inode);
+	inode_dec_link_count(inode);
 	iput(inode);
 	goto out;
 }
@@ -1150,7 +1147,7 @@ static int udf_link(struct dentry * old_dentry, struct inode * dir,
 	if (fibh.sbh != fibh.ebh)
 		udf_release_data(fibh.ebh);
 	udf_release_data(fibh.sbh);
-	inode->i_nlink ++;
+	inc_nlink(inode);
 	inode->i_ctime = current_fs_time(inode->i_sb);
 	mark_inode_dirty(inode);
 	atomic_inc(&inode->i_count);
@@ -1261,9 +1258,8 @@ static int udf_rename (struct inode * old_dir, struct dentry * old_dentry,
 
 	if (new_inode)
 	{
-		new_inode->i_nlink--;
 		new_inode->i_ctime = current_fs_time(new_inode->i_sb);
-		mark_inode_dirty(new_inode);
+		inode_dec_link_count(new_inode);
 	}
 	old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb);
 	mark_inode_dirty(old_dir);
@@ -1279,16 +1275,14 @@ static int udf_rename (struct inode * old_dir, struct dentry * old_dentry,
 		}
 		else
 			mark_buffer_dirty_inode(dir_bh, old_inode);
-		old_dir->i_nlink --;
-		mark_inode_dirty(old_dir);
+		inode_dec_link_count(old_dir);
 		if (new_inode)
 		{
-			new_inode->i_nlink --;
-			mark_inode_dirty(new_inode);
+			inode_dec_link_count(new_inode);
 		}
 		else
 		{
-			new_dir->i_nlink ++;
+			inc_nlink(new_dir);
 			mark_inode_dirty(new_dir);
 		}
 	}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index fcce1a21a51b..1aea6a4f9a4a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -156,8 +156,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(udf_inode_cachep))
-		printk(KERN_INFO "udf_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(udf_inode_cachep);
 }
 
 /* Superblock operations */
@@ -1622,6 +1621,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 		goto error_out;
 	}
 
+	if (UDF_SB_PARTFLAGS(sb, UDF_SB_PARTITION(sb)) & UDF_PART_FLAG_READ_ONLY) {
+		printk("UDF-fs: Partition marked readonly; forcing readonly mount\n");
+		sb->s_flags |= MS_RDONLY;
+	}
+
 	if ( udf_find_fileset(sb, &fileset, &rootdir) )
 	{
 		printk("UDF-fs: No fileset found\n");
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a9c6e5f04fae..1e096323bad4 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -53,8 +53,10 @@ static int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
  
 const struct file_operations ufs_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= generic_file_read,
-	.write		= generic_file_write,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.write		= do_sync_write,
+	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.open           = generic_file_open,
 	.fsync		= ufs_sync_file,
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 9501dcd3b213..2ad1259c6eca 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -255,7 +255,6 @@ cg_found:
 		inode->i_gid = current->fsgid;
 
 	inode->i_ino = cg * uspi->s_ipg + bit;
-	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	ufsi->i_flags = UFS_I(dir)->i_flags;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 30c6e8a9446c..ee1eaa6f4ec2 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -741,7 +741,6 @@ void ufs_read_inode(struct inode * inode)
 		ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino));
 	}
 
-	inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/
 	inode->i_version++;
 	ufsi->i_lastfrag =
 		(inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index d344b411e261..e84c0ecf0730 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -308,7 +308,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		ufs_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		inode_dec_link_count(new_inode);
 	} else {
 		if (dir_de) {
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 992ee0b87cc3..ec79e3091d1b 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -611,11 +611,10 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	
 	UFSD("ENTER\n");
 		
-	sbi = kmalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		goto failed_nomem;
 	sb->s_fs_info = sbi;
-	memset(sbi, 0, sizeof(struct ufs_sb_info));
 
 	UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
 	
@@ -1245,8 +1244,7 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
-	if (kmem_cache_destroy(ufs_inode_cachep))
-		printk(KERN_INFO "ufs_inode_cache: not all structures were freed\n");
+	kmem_cache_destroy(ufs_inode_cachep);
 }
 
 #ifdef CONFIG_QUOTA
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 22f820a9b15c..17437574f79c 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -184,14 +184,13 @@ void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi,
 dev_t
 ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
 {
-	__fs32 fs32;
+	__u32 fs32;
 	dev_t dev;
 
 	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-		fs32 = ufsi->i_u1.i_data[1];
+		fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[1]);
 	else
-		fs32 = ufsi->i_u1.i_data[0];
-	fs32 = fs32_to_cpu(sb, fs32);
+		fs32 = fs32_to_cpu(sb, ufsi->i_u1.i_data[0]);
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUNx86:
 	case UFS_ST_SUN:
@@ -212,7 +211,7 @@ ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
 void
 ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev)
 {
-	__fs32 fs32;
+	__u32 fs32;
 
 	switch (UFS_SB(sb)->s_flags & UFS_ST_MASK) {
 	case UFS_ST_SUNx86:
@@ -227,11 +226,10 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev
 		fs32 = old_encode_dev(dev);
 		break;
 	}
-	fs32 = cpu_to_fs32(sb, fs32);
 	if ((UFS_SB(sb)->s_flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-		ufsi->i_u1.i_data[1] = fs32;
+		ufsi->i_u1.i_data[1] = cpu_to_fs32(sb, fs32);
 	else
-		ufsi->i_u1.i_data[0] = fs32;
+		ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32);
 }
 
 /**
diff --git a/fs/utimes.c b/fs/utimes.c
new file mode 100644
index 000000000000..1bcd852fc4a9
--- /dev/null
+++ b/fs/utimes.c
@@ -0,0 +1,137 @@
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/linkage.h>
+#include <linux/namei.h>
+#include <linux/utime.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+
+#ifdef __ARCH_WANT_SYS_UTIME
+
+/*
+ * sys_utime() can be implemented in user-level using sys_utimes().
+ * Is this for backwards compatibility?  If so, why not move it
+ * into the appropriate arch directory (for those architectures that
+ * need it).
+ */
+
+/* If times==NULL, set access and modification to current time,
+ * must be owner or have write permission.
+ * Else, update from *times, must be owner or super user.
+ */
+asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times)
+{
+	int error;
+	struct nameidata nd;
+	struct inode * inode;
+	struct iattr newattrs;
+
+	error = user_path_walk(filename, &nd);
+	if (error)
+		goto out;
+	inode = nd.dentry->d_inode;
+
+	error = -EROFS;
+	if (IS_RDONLY(inode))
+		goto dput_and_out;
+
+	/* Don't worry, the checks are done in inode_change_ok() */
+	newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
+	if (times) {
+		error = -EPERM;
+		if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+			goto dput_and_out;
+
+		error = get_user(newattrs.ia_atime.tv_sec, &times->actime);
+		newattrs.ia_atime.tv_nsec = 0;
+		if (!error)
+			error = get_user(newattrs.ia_mtime.tv_sec, &times->modtime);
+		newattrs.ia_mtime.tv_nsec = 0;
+		if (error)
+			goto dput_and_out;
+
+		newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+	} else {
+                error = -EACCES;
+                if (IS_IMMUTABLE(inode))
+                        goto dput_and_out;
+
+		if (current->fsuid != inode->i_uid &&
+		    (error = vfs_permission(&nd, MAY_WRITE)) != 0)
+			goto dput_and_out;
+	}
+	mutex_lock(&inode->i_mutex);
+	error = notify_change(nd.dentry, &newattrs);
+	mutex_unlock(&inode->i_mutex);
+dput_and_out:
+	path_release(&nd);
+out:
+	return error;
+}
+
+#endif
+
+/* If times==NULL, set access and modification to current time,
+ * must be owner or have write permission.
+ * Else, update from *times, must be owner or super user.
+ */
+long do_utimes(int dfd, char __user *filename, struct timeval *times)
+{
+	int error;
+	struct nameidata nd;
+	struct inode * inode;
+	struct iattr newattrs;
+
+	error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
+
+	if (error)
+		goto out;
+	inode = nd.dentry->d_inode;
+
+	error = -EROFS;
+	if (IS_RDONLY(inode))
+		goto dput_and_out;
+
+	/* Don't worry, the checks are done in inode_change_ok() */
+	newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
+	if (times) {
+		error = -EPERM;
+                if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+                        goto dput_and_out;
+
+		newattrs.ia_atime.tv_sec = times[0].tv_sec;
+		newattrs.ia_atime.tv_nsec = times[0].tv_usec * 1000;
+		newattrs.ia_mtime.tv_sec = times[1].tv_sec;
+		newattrs.ia_mtime.tv_nsec = times[1].tv_usec * 1000;
+		newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+	} else {
+		error = -EACCES;
+                if (IS_IMMUTABLE(inode))
+                        goto dput_and_out;
+
+		if (current->fsuid != inode->i_uid &&
+		    (error = vfs_permission(&nd, MAY_WRITE)) != 0)
+			goto dput_and_out;
+	}
+	mutex_lock(&inode->i_mutex);
+	error = notify_change(nd.dentry, &newattrs);
+	mutex_unlock(&inode->i_mutex);
+dput_and_out:
+	path_release(&nd);
+out:
+	return error;
+}
+
+asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes)
+{
+	struct timeval times[2];
+
+	if (utimes && copy_from_user(&times, utimes, sizeof(times)))
+		return -EFAULT;
+	return do_utimes(dfd, filename, utimes ? times : NULL);
+}
+
+asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes)
+{
+	return sys_futimesat(AT_FDCWD, filename, utimes);
+}
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index 9a8f48bae956..edb711ff7b05 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -782,9 +782,9 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
 	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
 	if (err)
 		goto out;
-	dir->i_nlink--;
+	drop_nlink(dir);
 
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
 	fat_detach(inode);
 out:
@@ -808,7 +808,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
 	err = fat_remove_entries(dir, &sinfo);	/* and releases bh */
 	if (err)
 		goto out;
-	inode->i_nlink = 0;
+	clear_nlink(inode);
 	inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
 	fat_detach(inode);
 out:
@@ -837,7 +837,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_free;
 	dir->i_version++;
-	dir->i_nlink++;
+	inc_nlink(dir);
 
 	inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
 	brelse(sinfo.bh);
@@ -930,9 +930,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if (err)
 				goto error_dotdot;
 		}
-		old_dir->i_nlink--;
+		drop_nlink(old_dir);
 		if (!new_inode)
- 			new_dir->i_nlink++;
+ 			inc_nlink(new_dir);
 	}
 
 	err = fat_remove_entries(old_dir, &old_sinfo);	/* and releases bh */
@@ -947,10 +947,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		mark_inode_dirty(old_dir);
 
 	if (new_inode) {
+		drop_nlink(new_inode);
 		if (is_dir)
-			new_inode->i_nlink -= 2;
-		else
-			new_inode->i_nlink--;
+			drop_nlink(new_inode);
 		new_inode->i_ctime = ts;
 	}
 out:
diff --git a/fs/xattr.c b/fs/xattr.c
index c32f15b5f60f..395635100f77 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -135,6 +135,26 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size)
 }
 EXPORT_SYMBOL_GPL(vfs_getxattr);
 
+ssize_t
+vfs_listxattr(struct dentry *d, char *list, size_t size)
+{
+	ssize_t error;
+
+	error = security_inode_listxattr(d);
+	if (error)
+		return error;
+	error = -EOPNOTSUPP;
+	if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
+		error = d->d_inode->i_op->listxattr(d, list, size);
+	} else {
+		error = security_inode_listsecurity(d->d_inode, list, size);
+		if (size && error > size)
+			error = -ERANGE;
+	}
+	return error;
+}
+EXPORT_SYMBOL_GPL(vfs_listxattr);
+
 int
 vfs_removexattr(struct dentry *dentry, char *name)
 {
@@ -346,17 +366,7 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 			return -ENOMEM;
 	}
 
-	error = security_inode_listxattr(d);
-	if (error)
-		goto out;
-	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
-		error = d->d_inode->i_op->listxattr(d, klist, size);
-	} else {
-		error = security_inode_listsecurity(d->d_inode, klist, size);
-		if (size && error > size)
-			error = -ERANGE;
-	}
+	error = vfs_listxattr(d, klist, size);
 	if (error > 0) {
 		if (size && copy_to_user(list, klist, error))
 			error = -EFAULT;
@@ -365,7 +375,6 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 		   than XATTR_LIST_MAX bytes. Not possible. */
 		error = -E2BIG;
 	}
-out:
 	kfree(klist);
 	return error;
 }
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 26b364c9d62c..35115bca036e 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,5 +1,6 @@
 config XFS_FS
 	tristate "XFS filesystem support"
+	depends on BLOCK
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
index 9e7f85986d0d..291948d5085a 100644
--- a/fs/xfs/Makefile-linux-2.6
+++ b/fs/xfs/Makefile-linux-2.6
@@ -30,7 +30,6 @@ ifeq ($(CONFIG_XFS_TRACE),y)
 	EXTRA_CFLAGS += -DXFS_BLI_TRACE
 	EXTRA_CFLAGS += -DXFS_BMAP_TRACE
 	EXTRA_CFLAGS += -DXFS_BMBT_TRACE
-	EXTRA_CFLAGS += -DXFS_DIR_TRACE
 	EXTRA_CFLAGS += -DXFS_DIR2_TRACE
 	EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
 	EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index aba7fcf881a2..004baf600611 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -21,6 +21,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include "time.h"
 #include "kmem.h"
 
@@ -34,6 +35,14 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 	gfp_t	lflags = kmem_flags_convert(flags);
 	void	*ptr;
 
+#ifdef DEBUG
+	if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
+		printk(KERN_WARNING "Large %s attempt, size=%ld\n",
+			__FUNCTION__, (long)size);
+		dump_stack();
+	}
+#endif
+
 	do {
 		if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
 			ptr = kmalloc(size, lflags);
@@ -45,7 +54,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
 					__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
 
@@ -60,6 +69,27 @@ kmem_zalloc(size_t size, unsigned int __nocast flags)
 	return ptr;
 }
 
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
+		   unsigned int __nocast flags)
+{
+	void		*ptr;
+	size_t		kmsize = maxsize;
+	unsigned int	kmflags = (flags & ~KM_SLEEP) | KM_NOSLEEP;
+
+	while (!(ptr = kmem_zalloc(kmsize, kmflags))) {
+		if ((kmsize <= minsize) && (flags & KM_NOSLEEP))
+			break;
+		if ((kmsize >>= 1) <= minsize) {
+			kmsize = minsize;
+			kmflags = flags;
+		}
+	}
+	if (ptr)
+		*size = kmsize;
+	return ptr;
+}
+
 void
 kmem_free(void *ptr, size_t size)
 {
@@ -102,7 +132,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
 					__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
 
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 939bd84bc7ee..9ebabdf7829c 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -30,6 +30,7 @@
 #define KM_NOSLEEP	0x0002u
 #define KM_NOFS		0x0004u
 #define KM_MAYFAIL	0x0008u
+#define KM_LARGE	0x0010u
 
 /*
  * We use a special process flag to avoid recursive callbacks into
@@ -41,7 +42,7 @@ kmem_flags_convert(unsigned int __nocast flags)
 {
 	gfp_t	lflags;
 
-	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
+	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_LARGE));
 
 	if (flags & KM_NOSLEEP) {
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -54,8 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags)
 }
 
 extern void *kmem_alloc(size_t, unsigned int __nocast);
-extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
 extern void *kmem_zalloc(size_t, unsigned int __nocast);
+extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
+extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
 extern void  kmem_free(void *, size_t);
 
 /*
@@ -91,8 +93,8 @@ kmem_zone_free(kmem_zone_t *zone, void *ptr)
 static inline void
 kmem_zone_destroy(kmem_zone_t *zone)
 {
-	if (zone && kmem_cache_destroy(zone))
-		BUG();
+	if (zone)
+		kmem_cache_destroy(zone);
 }
 
 extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index b25090094cca..2009e6d922ce 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -29,8 +29,6 @@
 
 typedef struct semaphore sema_t;
 
-#define init_sema(sp, val, c, d)	sema_init(sp, val)
-#define initsema(sp, val)		sema_init(sp, val)
 #define initnsema(sp, val, name)	sema_init(sp, val)
 #define psema(sp, b)			down(sp)
 #define vsema(sp)			up(sp)
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
index 9a8ad481b008..351a8f454bd1 100644
--- a/fs/xfs/linux-2.6/sv.h
+++ b/fs/xfs/linux-2.6/sv.h
@@ -53,8 +53,6 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
 	remove_wait_queue(&sv->waiters, &wait);
 }
 
-#define init_sv(sv,type,name,flag) \
-	init_waitqueue_head(&(sv)->waiters)
 #define sv_init(sv,flag,name) \
 	init_waitqueue_head(&(sv)->waiters)
 #define sv_destroy(sv) \
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 34dcb43a7837..09360cf1e1f2 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -71,7 +71,7 @@ xfs_page_trace(
 	int		tag,
 	struct inode	*inode,
 	struct page	*page,
-	int		mask)
+	unsigned long	pgoff)
 {
 	xfs_inode_t	*ip;
 	bhv_vnode_t	*vp = vn_from_inode(inode);
@@ -91,7 +91,7 @@ xfs_page_trace(
 		(void *)ip,
 		(void *)inode,
 		(void *)page,
-		(void *)((unsigned long)mask),
+		(void *)pgoff,
 		(void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
 		(void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
 		(void *)((unsigned long)((isize >> 32) & 0xffffffff)),
@@ -105,7 +105,7 @@ xfs_page_trace(
 		(void *)NULL);
 }
 #else
-#define xfs_page_trace(tag, inode, page, mask)
+#define xfs_page_trace(tag, inode, page, pgoff)
 #endif
 
 /*
@@ -1197,7 +1197,7 @@ xfs_vm_releasepage(
 		.nr_to_write = 1,
 	};
 
-	xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask);
+	xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, 0);
 
 	if (!page_has_buffers(page))
 		return 0;
@@ -1356,7 +1356,6 @@ xfs_end_io_direct(
 		ioend->io_size = size;
 		xfs_finish_ioend(ioend);
 	} else {
-		ASSERT(size >= 0);
 		xfs_destroy_ioend(ioend);
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 2af528dcfb04..db5f5a3608ca 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -30,6 +30,7 @@
 #include <linux/hash.h>
 #include <linux/kthread.h>
 #include <linux/migrate.h>
+#include <linux/backing-dev.h>
 #include "xfs_linux.h"
 
 STATIC kmem_zone_t *xfs_buf_zone;
@@ -318,8 +319,12 @@ xfs_buf_free(
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
 			free_address(bp->b_addr - bp->b_offset);
 
-		for (i = 0; i < bp->b_page_count; i++)
-			page_cache_release(bp->b_pages[i]);
+		for (i = 0; i < bp->b_page_count; i++) {
+			struct page	*page = bp->b_pages[i];
+
+			ASSERT(!PagePrivate(page));
+			page_cache_release(page);
+		}
 		_xfs_buf_free_pages(bp);
 	} else if (bp->b_flags & _XBF_KMEM_ALLOC) {
 		 /*
@@ -391,7 +396,7 @@ _xfs_buf_lookup_pages(
 
 			XFS_STATS_INC(xb_page_retries);
 			xfsbufd_wakeup(0, gfp_mask);
-			blk_congestion_wait(WRITE, HZ/50);
+			congestion_wait(WRITE, HZ/50);
 			goto retry;
 		}
 
@@ -400,6 +405,7 @@ _xfs_buf_lookup_pages(
 		nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
 		size -= nbytes;
 
+		ASSERT(!PagePrivate(page));
 		if (!PageUptodate(page)) {
 			page_count--;
 			if (blocksize >= PAGE_CACHE_SIZE) {
@@ -768,7 +774,7 @@ xfs_buf_get_noaddr(
 	_xfs_buf_initialize(bp, target, 0, len, 0);
 
  try_again:
-	data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
+	data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL | KM_LARGE);
 	if (unlikely(data == NULL))
 		goto fail_free_buf;
 
@@ -1117,10 +1123,10 @@ xfs_buf_bio_end_io(
 	do {
 		struct page	*page = bvec->bv_page;
 
+		ASSERT(!PagePrivate(page));
 		if (unlikely(bp->b_error)) {
 			if (bp->b_flags & XBF_READ)
 				ClearPageUptodate(page);
-			SetPageError(page);
 		} else if (blocksize >= PAGE_CACHE_SIZE) {
 			SetPageUptodate(page);
 		} else if (!PagePrivate(page) &&
@@ -1156,16 +1162,16 @@ _xfs_buf_ioapply(
 	total_nr_pages = bp->b_page_count;
 	map_i = 0;
 
-	if (bp->b_flags & _XBF_RUN_QUEUES) {
-		bp->b_flags &= ~_XBF_RUN_QUEUES;
-		rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC;
-	} else {
-		rw = (bp->b_flags & XBF_READ) ? READ : WRITE;
-	}
-
 	if (bp->b_flags & XBF_ORDERED) {
 		ASSERT(!(bp->b_flags & XBF_READ));
 		rw = WRITE_BARRIER;
+	} else if (bp->b_flags & _XBF_RUN_QUEUES) {
+		ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
+		bp->b_flags &= ~_XBF_RUN_QUEUES;
+		rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
+	} else {
+		rw = (bp->b_flags & XBF_WRITE) ? WRITE :
+		     (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
 	}
 
 	/* Special code path for reading a sub page size buffer in --
@@ -1681,6 +1687,7 @@ xfsbufd(
 	xfs_buf_t		*bp, *n;
 	struct list_head	*dwq = &target->bt_delwrite_queue;
 	spinlock_t		*dwlk = &target->bt_delwrite_lock;
+	int			count;
 
 	current->flags |= PF_MEMALLOC;
 
@@ -1696,6 +1703,7 @@ xfsbufd(
 		schedule_timeout_interruptible(
 			xfs_buf_timer_centisecs * msecs_to_jiffies(10));
 
+		count = 0;
 		age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
 		spin_lock(dwlk);
 		list_for_each_entry_safe(bp, n, dwq, b_list) {
@@ -1711,9 +1719,11 @@ xfsbufd(
 					break;
 				}
 
-				bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
+				bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
+						 _XBF_RUN_QUEUES);
 				bp->b_flags |= XBF_WRITE;
-				list_move(&bp->b_list, &tmp);
+				list_move_tail(&bp->b_list, &tmp);
+				count++;
 			}
 		}
 		spin_unlock(dwlk);
@@ -1724,12 +1734,12 @@ xfsbufd(
 
 			list_del_init(&bp->b_list);
 			xfs_buf_iostrategy(bp);
-
-			blk_run_address_space(target->bt_mapping);
 		}
 
 		if (as_list_len > 0)
 			purge_addresses();
+		if (count)
+			blk_run_address_space(target->bt_mapping);
 
 		clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
 	} while (!kthread_should_stop());
@@ -1767,7 +1777,7 @@ xfs_flush_buftarg(
 			continue;
 		}
 
-		list_move(&bp->b_list, &tmp);
+		list_move_tail(&bp->b_list, &tmp);
 	}
 	spin_unlock(dwlk);
 
@@ -1776,7 +1786,7 @@ xfs_flush_buftarg(
 	 */
 	list_for_each_entry_safe(bp, n, &tmp, b_list) {
 		xfs_buf_lock(bp);
-		bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
+		bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|_XBF_RUN_QUEUES);
 		bp->b_flags |= XBF_WRITE;
 		if (wait)
 			bp->b_flags &= ~XBF_ASYNC;
@@ -1786,6 +1796,9 @@ xfs_flush_buftarg(
 		xfs_buf_iostrategy(bp);
 	}
 
+	if (wait)
+		blk_run_address_space(target->bt_mapping);
+
 	/*
 	 * Remaining list items must be flushed before returning
 	 */
@@ -1797,9 +1810,6 @@ xfs_flush_buftarg(
 		xfs_buf_relse(bp);
 	}
 
-	if (wait)
-		blk_run_address_space(target->bt_mapping);
-
 	return pincount;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 7858703ed84c..9dd235cb0107 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -298,11 +298,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
 #define XFS_BUF_UNWRITE(bp)	((bp)->b_flags &= ~XBF_WRITE)
 #define XFS_BUF_ISWRITE(bp)	((bp)->b_flags & XBF_WRITE)
 
-#define XFS_BUF_ISUNINITIAL(bp)	(0)
-#define XFS_BUF_UNUNINITIAL(bp)	(0)
-
-#define XFS_BUF_BP_ISMAPPED(bp)	(1)
-
 #define XFS_BUF_IODONE_FUNC(bp)			((bp)->b_iodone)
 #define XFS_BUF_SET_IODONE_FUNC(bp, func)	((bp)->b_iodone = (func))
 #define XFS_BUF_CLR_IODONE_FUNC(bp)		((bp)->b_iodone = NULL)
@@ -393,8 +388,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
 	return error;
 }
 
-#define XFS_bdwrite(bp)		xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC)
-
 static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
 {
 	bp->b_strat = xfs_bdstrat_cb;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 3d4f6dff2113..d93d8dd1958d 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -49,50 +49,49 @@ static struct vm_operations_struct xfs_dmapi_file_vm_ops;
 STATIC inline ssize_t
 __xfs_file_read(
 	struct kiocb		*iocb,
-	char			__user *buf,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
 	int			ioflags,
-	size_t			count,
 	loff_t			pos)
 {
-	struct iovec		iov = {buf, count};
 	struct file		*file = iocb->ki_filp;
 	bhv_vnode_t		*vp = vn_from_inode(file->f_dentry->d_inode);
 
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-	return bhv_vop_read(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
+	return bhv_vop_read(vp, iocb, iov, nr_segs, &iocb->ki_pos,
+				ioflags, NULL);
 }
 
 STATIC ssize_t
 xfs_file_aio_read(
 	struct kiocb		*iocb,
-	char			__user *buf,
-	size_t			count,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
 	loff_t			pos)
 {
-	return __xfs_file_read(iocb, buf, IO_ISAIO, count, pos);
+	return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos);
 }
 
 STATIC ssize_t
 xfs_file_aio_read_invis(
 	struct kiocb		*iocb,
-	char			__user *buf,
-	size_t			count,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
 	loff_t			pos)
 {
-	return __xfs_file_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
+	return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
 }
 
 STATIC inline ssize_t
 __xfs_file_write(
-	struct kiocb	*iocb,
-	const char	__user *buf,
-	int		ioflags,
-	size_t		count,
-	loff_t		pos)
+	struct kiocb		*iocb,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
+	int			ioflags,
+	loff_t			pos)
 {
-	struct iovec	iov = {(void __user *)buf, count};
 	struct file	*file = iocb->ki_filp;
 	struct inode	*inode = file->f_mapping->host;
 	bhv_vnode_t	*vp = vn_from_inode(inode);
@@ -100,117 +99,28 @@ __xfs_file_write(
 	BUG_ON(iocb->ki_pos != pos);
 	if (unlikely(file->f_flags & O_DIRECT))
 		ioflags |= IO_ISDIRECT;
-	return bhv_vop_write(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL);
+	return bhv_vop_write(vp, iocb, iov, nr_segs, &iocb->ki_pos,
+				ioflags, NULL);
 }
 
 STATIC ssize_t
 xfs_file_aio_write(
 	struct kiocb		*iocb,
-	const char		__user *buf,
-	size_t			count,
+	const struct iovec	*iov,
+	unsigned long		nr_segs,
 	loff_t			pos)
 {
-	return __xfs_file_write(iocb, buf, IO_ISAIO, count, pos);
+	return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos);
 }
 
 STATIC ssize_t
 xfs_file_aio_write_invis(
 	struct kiocb		*iocb,
-	const char		__user *buf,
-	size_t			count,
-	loff_t			pos)
-{
-	return __xfs_file_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
-}
-
-STATIC inline ssize_t
-__xfs_file_readv(
-	struct file		*file,
-	const struct iovec 	*iov,
-	int			ioflags,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	struct inode	*inode = file->f_mapping->host;
-	bhv_vnode_t	*vp = vn_from_inode(inode);
-	struct kiocb	kiocb;
-	ssize_t		rval;
-
-	init_sync_kiocb(&kiocb, file);
-	kiocb.ki_pos = *ppos;
-
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-	rval = bhv_vop_read(vp, &kiocb, iov, nr_segs,
-				&kiocb.ki_pos, ioflags, NULL);
-
-	*ppos = kiocb.ki_pos;
-	return rval;
-}
-
-STATIC ssize_t
-xfs_file_readv(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __xfs_file_readv(file, iov, 0, nr_segs, ppos);
-}
-
-STATIC ssize_t
-xfs_file_readv_invis(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __xfs_file_readv(file, iov, IO_INVIS, nr_segs, ppos);
-}
-
-STATIC inline ssize_t
-__xfs_file_writev(
-	struct file		*file,
-	const struct iovec 	*iov,
-	int			ioflags,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	struct inode	*inode = file->f_mapping->host;
-	bhv_vnode_t	*vp = vn_from_inode(inode);
-	struct kiocb	kiocb;
-	ssize_t		rval;
-
-	init_sync_kiocb(&kiocb, file);
-	kiocb.ki_pos = *ppos;
-	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
-
-	rval = bhv_vop_write(vp, &kiocb, iov, nr_segs,
-				 &kiocb.ki_pos, ioflags, NULL);
-
-	*ppos = kiocb.ki_pos;
-	return rval;
-}
-
-STATIC ssize_t
-xfs_file_writev(
-	struct file		*file,
-	const struct iovec 	*iov,
+	const struct iovec	*iov,
 	unsigned long		nr_segs,
-	loff_t			*ppos)
-{
-	return __xfs_file_writev(file, iov, 0, nr_segs, ppos);
-}
-
-STATIC ssize_t
-xfs_file_writev_invis(
-	struct file		*file,
-	const struct iovec 	*iov,
-	unsigned long		nr_segs,
-	loff_t			*ppos)
+	loff_t			pos)
 {
-	return __xfs_file_writev(file, iov, IO_INVIS, nr_segs, ppos);
+	return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
 }
 
 STATIC ssize_t
@@ -370,7 +280,7 @@ xfs_file_readdir(
 
 	/* Try fairly hard to get memory */
 	do {
-		if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL)))
+		if ((read_buf = kmalloc(rlen, GFP_KERNEL)))
 			break;
 		rlen >>= 1;
 	} while (rlen >= 1024);
@@ -540,8 +450,6 @@ const struct file_operations xfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.readv		= xfs_file_readv,
-	.writev		= xfs_file_writev,
 	.aio_read	= xfs_file_aio_read,
 	.aio_write	= xfs_file_aio_write,
 	.sendfile	= xfs_file_sendfile,
@@ -565,8 +473,6 @@ const struct file_operations xfs_invis_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-	.readv		= xfs_file_readv_invis,
-	.writev		= xfs_file_writev_invis,
 	.aio_read	= xfs_file_aio_read_invis,
 	.aio_write	= xfs_file_aio_write_invis,
 	.sendfile	= xfs_file_sendfile_invis,
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index 6c162c3dde7e..ed3a5e1b4b67 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -34,7 +34,7 @@ xfs_param_t xfs_params = {
 	.restrict_chown	= {	0,		1,		1	},
 	.sgid_inherit	= {	0,		0,		1	},
 	.symlink_mode	= {	0,		0,		1	},
-	.panic_mask	= {	0,		0,		127	},
+	.panic_mask	= {	0,		0,		255	},
 	.error_level	= {	0,		3,		11	},
 	.syncd_timer	= {	1*100,		30*100,		7200*100},
 	.stats_clear	= {	0,		0,		1	},
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 6e52a5dd38d8..a74f854d91e6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -653,7 +653,7 @@ xfs_attrmulti_by_handle(
 STATIC int
 xfs_ioc_space(
 	bhv_desc_t		*bdp,
-	bhv_vnode_t		*vp,
+	struct inode		*inode,
 	struct file		*filp,
 	int			flags,
 	unsigned int		cmd,
@@ -735,7 +735,7 @@ xfs_ioctl(
 		    !capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
-		return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg);
+		return xfs_ioc_space(bdp, inode, filp, ioflags, cmd, arg);
 
 	case XFS_IOC_DIOINFO: {
 		struct dioattr	da;
@@ -763,6 +763,8 @@ xfs_ioctl(
 		return xfs_ioc_fsgeometry(mp, arg);
 
 	case XFS_IOC_GETVERSION:
+		return put_user(inode->i_generation, (int __user *)arg);
+
 	case XFS_IOC_GETXFLAGS:
 	case XFS_IOC_SETXFLAGS:
 	case XFS_IOC_FSGETXATTR:
@@ -957,7 +959,7 @@ xfs_ioctl(
 STATIC int
 xfs_ioc_space(
 	bhv_desc_t		*bdp,
-	bhv_vnode_t		*vp,
+	struct inode		*inode,
 	struct file		*filp,
 	int			ioflags,
 	unsigned int		cmd,
@@ -967,13 +969,13 @@ xfs_ioc_space(
 	int			attr_flags = 0;
 	int			error;
 
-	if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
+	if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
 		return -XFS_ERROR(EPERM);
 
 	if (!(filp->f_mode & FMODE_WRITE))
 		return -XFS_ERROR(EBADF);
 
-	if (!VN_ISREG(vp))
+	if (!S_ISREG(inode->i_mode))
 		return -XFS_ERROR(EINVAL);
 
 	if (copy_from_user(&bf, arg, sizeof(bf)))
@@ -1264,13 +1266,6 @@ xfs_ioc_xattr(
 		break;
 	}
 
-	case XFS_IOC_GETVERSION: {
-		flags = vn_to_inode(vp)->i_generation;
-		if (copy_to_user(arg, &flags, sizeof(flags)))
-			error = -EFAULT;
-		break;
-	}
-
 	default:
 		error = -ENOTTY;
 		break;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index d9180020de63..3ba814ae3bba 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -553,13 +553,13 @@ xfs_vn_follow_link(
 	ASSERT(dentry);
 	ASSERT(nd);
 
-	link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+	link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
 	if (!link) {
 		nd_set_link(nd, ERR_PTR(-ENOMEM));
 		return NULL;
 	}
 
-	uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL);
+	uio = kmalloc(sizeof(uio_t), GFP_KERNEL);
 	if (!uio) {
 		kfree(link);
 		nd_set_link(nd, ERR_PTR(-ENOMEM));
@@ -623,12 +623,27 @@ xfs_vn_getattr(
 {
 	struct inode	*inode = dentry->d_inode;
 	bhv_vnode_t	*vp = vn_from_inode(inode);
-	int		error = 0;
+	bhv_vattr_t	vattr = { .va_mask = XFS_AT_STAT };
+	int		error;
 
-	if (unlikely(vp->v_flag & VMODIFIED))
-		error = vn_revalidate(vp);
-	if (!error)
-		generic_fillattr(inode, stat);
+	error = bhv_vop_getattr(vp, &vattr, ATTR_LAZY, NULL);
+	if (likely(!error)) {
+		stat->size = i_size_read(inode);
+		stat->dev = inode->i_sb->s_dev;
+		stat->rdev = (vattr.va_rdev == 0) ? 0 :
+				MKDEV(sysv_major(vattr.va_rdev) & 0x1ff,
+				      sysv_minor(vattr.va_rdev));
+		stat->mode = vattr.va_mode;
+		stat->nlink = vattr.va_nlink;
+		stat->uid = vattr.va_uid;
+		stat->gid = vattr.va_gid;
+		stat->ino = vattr.va_nodeid;
+		stat->atime = vattr.va_atime;
+		stat->mtime = vattr.va_mtime;
+		stat->ctime = vattr.va_ctime;
+		stat->blocks = vattr.va_nblocks;
+		stat->blksize = vattr.va_blocksize;
+	}
 	return -error;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index a13f75c1a936..2b0e0018738a 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -148,11 +148,7 @@ BUFFER_FNS(PrivateStart, unwritten);
 		(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
 
 #define NBPP		PAGE_SIZE
-#define DPPSHFT		(PAGE_SHIFT - 9)
 #define NDPP		(1 << (PAGE_SHIFT - 9))
-#define dtop(DD)	(((DD) + NDPP - 1) >> DPPSHFT)
-#define dtopt(DD)	((DD) >> DPPSHFT)
-#define dpoff(DD)	((DD) & (NDPP-1))
 
 #define NBBY		8		/* number of bits per byte */
 #define	NBPC		PAGE_SIZE	/* Number of bytes per click */
@@ -172,8 +168,6 @@ BUFFER_FNS(PrivateStart, unwritten);
 #define	btoct(x)	((__psunsigned_t)(x)>>BPCSHIFT)
 #define	btoc64(x)	(((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
 #define	btoct64(x)	((__uint64_t)(x)>>BPCSHIFT)
-#define	io_btoc(x)	(((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT)
-#define	io_btoct(x)	((__psunsigned_t)(x)>>IO_BPCSHIFT)
 
 /* off_t bytes to clicks */
 #define offtoc(x)       (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT)
@@ -186,7 +180,6 @@ BUFFER_FNS(PrivateStart, unwritten);
 #define	ctob(x)		((__psunsigned_t)(x)<<BPCSHIFT)
 #define btoct(x)        ((__psunsigned_t)(x)>>BPCSHIFT)
 #define	ctob64(x)	((__uint64_t)(x)<<BPCSHIFT)
-#define	io_ctob(x)	((__psunsigned_t)(x)<<IO_BPCSHIFT)
 
 /* bytes to clicks */
 #define btoc(x)         (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT)
@@ -339,4 +332,11 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
 	return(x * y);
 }
 
+static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
+{
+	x += y - 1;
+	do_div(x, y);
+	return x;
+}
+
 #endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index ee788b1cb364..fa842f1c9fa2 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -270,16 +270,18 @@ xfs_read(
 		}
 	}
 
-	if (unlikely((ioflags & IO_ISDIRECT) && VN_CACHED(vp)))
-		bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
-						-1, FI_REMAPF_LOCKED);
-
-	if (unlikely(ioflags & IO_ISDIRECT))
+	if (unlikely(ioflags & IO_ISDIRECT)) {
+		if (VN_CACHED(vp))
+			bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
+						 -1, FI_REMAPF_LOCKED);
 		mutex_unlock(&inode->i_mutex);
+	}
 
 	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
 				(void *)iovp, segs, *offset, ioflags);
-	ret = __generic_file_aio_read(iocb, iovp, segs, offset);
+
+	iocb->ki_pos = *offset;
+	ret = generic_file_aio_read(iocb, iovp, segs, *offset);
 	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
 		ret = wait_on_sync_kiocb(iocb);
 	if (ret > 0)
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 4754f342a5d3..38c4d128a8c0 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -171,7 +171,6 @@ xfs_revalidate_inode(
 		break;
 	}
 
-	inode->i_blksize = xfs_preferred_iosize(mp);
 	inode->i_generation = ip->i_d.di_gen;
 	i_size_write(inode, ip->i_d.di_size);
 	inode->i_blocks =
@@ -228,7 +227,9 @@ xfs_initialize_vnode(
 		xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip);
 		xfs_set_inodeops(inode);
 
+		spin_lock(&ip->i_flags_lock);
 		ip->i_flags &= ~XFS_INEW;
+		spin_unlock(&ip->i_flags_lock);
 		barrier();
 
 		unlock_new_inode(inode);
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 91fc2c4b3353..da255bdf5260 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -79,7 +79,7 @@ typedef enum {
 #define VFS_RDONLY		0x0001	/* read-only vfs */
 #define VFS_GRPID		0x0002	/* group-ID assigned from directory */
 #define VFS_DMI			0x0004	/* filesystem has the DMI enabled */
-#define VFS_UMOUNT		0x0008	/* unmount in progress */
+/* ---- VFS_UMOUNT ----		0x0008	-- unneeded, fixed via kthread APIs */
 #define VFS_32BITINODES		0x0010	/* do not use inums above 32 bits */
 #define VFS_END			0x0010	/* max flag */
 
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 6628d96b6fd6..553fa731ade5 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -122,7 +122,6 @@ vn_revalidate_core(
 	inode->i_blocks	    = vap->va_nblocks;
 	inode->i_mtime	    = vap->va_mtime;
 	inode->i_ctime	    = vap->va_ctime;
-	inode->i_blksize    = vap->va_blocksize;
 	if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
 	else
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index c42b3221b20c..515f5fdea57a 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -85,8 +85,6 @@ typedef enum {
 #define VN_BHV_HEAD(vp)			((bhv_head_t *)(&((vp)->v_bh)))
 #define vn_bhv_head_init(bhp,name)	bhv_head_init(bhp,name)
 #define vn_bhv_remove(bhp,bdp)		bhv_remove(bhp,bdp)
-#define vn_bhv_lookup(bhp,ops)		bhv_lookup(bhp,ops)
-#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops)
 
 /*
  * Vnode to Linux inode mapping.
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 5b2dcc58b244..33ad5af386e0 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -382,18 +382,6 @@ xfs_qm_dquot_logitem_unlock(
 
 
 /*
- * The transaction with the dquot locked has aborted.  The dquot
- * must not be dirty within the transaction.  We simply unlock just
- * as if the transaction had been cancelled.
- */
-STATIC void
-xfs_qm_dquot_logitem_abort(
-	xfs_dq_logitem_t    *ql)
-{
-	xfs_qm_dquot_logitem_unlock(ql);
-}
-
-/*
  * this needs to stamp an lsn into the dquot, I think.
  * rpc's that look at user dquot's would then have to
  * push on the dependency recorded in the dquot
@@ -426,7 +414,6 @@ STATIC struct xfs_item_ops xfs_dquot_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_qm_dquot_logitem_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort,
 	.iop_pushbuf	= (void(*)(xfs_log_item_t*))
 					xfs_qm_dquot_logitem_pushbuf,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
@@ -559,17 +546,6 @@ xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn)
 }
 
 /*
- * The transaction of which this QUOTAOFF is a part has been aborted.
- * Just clean up after ourselves.
- * Shouldn't this never happen in the case of qoffend logitems? XXX
- */
-STATIC void
-xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf)
-{
-	kmem_free(qf, sizeof(xfs_qoff_logitem_t));
-}
-
-/*
  * There isn't much you can do to push on an quotaoff item.  It is simply
  * stuck waiting for the log to be flushed to disk.
  */
@@ -644,7 +620,6 @@ STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_qm_qoffend_logitem_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
 	.iop_pushbuf	= NULL,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_qm_qoffend_logitem_committing
@@ -667,7 +642,6 @@ STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_qm_qoff_logitem_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort,
 	.iop_pushbuf	= NULL,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_qm_qoff_logitem_committing
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index e23e45535c48..7c6a3a50379e 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -112,17 +112,17 @@ xfs_Gqm_init(void)
 {
 	xfs_dqhash_t	*udqhash, *gdqhash;
 	xfs_qm_t	*xqm;
-	uint		i, hsize, flags = KM_SLEEP | KM_MAYFAIL;
+	size_t		hsize;
+	uint		i;
 
 	/*
 	 * Initialize the dquot hash tables.
 	 */
-	hsize = XFS_QM_HASHSIZE_HIGH;
-	while (!(udqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), flags))) {
-		if ((hsize >>= 1) <= XFS_QM_HASHSIZE_LOW)
-			flags = KM_SLEEP;
-	}
-	gdqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), KM_SLEEP);
+	udqhash = kmem_zalloc_greedy(&hsize,
+				     XFS_QM_HASHSIZE_LOW, XFS_QM_HASHSIZE_HIGH,
+				     KM_SLEEP | KM_MAYFAIL | KM_LARGE);
+	gdqhash = kmem_zalloc(hsize, KM_SLEEP | KM_LARGE);
+	hsize /= sizeof(xfs_dqhash_t);
 	ndquot = hsize << 8;
 
 	xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 4568deb6da86..689407de0a20 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -56,12 +56,6 @@ extern kmem_zone_t	*qm_dqtrxzone;
 #define XFS_QM_HASHSIZE_HIGH		((NBPP * 4) / sizeof(xfs_dqhash_t))
 
 /*
- * We output a cmn_err when quotachecking a quota file with more than
- * this many fsbs.
- */
-#define XFS_QM_BIG_QCHECK_NBLKS		500
-
-/*
  * This defines the unit of allocation of dquots.
  * Currently, it is just one file system block, and a 4K blk contains 30
  * (136 * 30 = 4080) dquots. It's probably not worth trying to make
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index b7ddd04aae32..a8b85e2be9d5 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -75,7 +75,6 @@ static inline int XQMISLCKD(struct xfs_dqhash *h)
 
 #define xfs_qm_freelist_lock(qm)	XQMLCK(&((qm)->qm_dqfreelist))
 #define xfs_qm_freelist_unlock(qm)	XQMUNLCK(&((qm)->qm_dqfreelist))
-#define XFS_QM_IS_FREELIST_LOCKED(qm)	XQMISLCKD(&((qm)->qm_dqfreelist))
 
 /*
  * Hash into a bucket in the dquot hash table, based on <mp, id>.
@@ -170,6 +169,5 @@ for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
 #define DQFLAGTO_TYPESTR(d)	(((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
 				 (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
 				 (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-#define DQFLAGTO_DIRTYSTR(d)	(XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY")
 
 #endif	/* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 36fbeccdc722..c75f68361e33 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -53,8 +53,7 @@ cmn_err(register int level, char *fmt, ...)
 	va_end(ap);
 	spin_unlock_irqrestore(&xfs_err_lock,flags);
 
-	if (level == CE_PANIC)
-		BUG();
+	BUG_ON(level == CE_PANIC);
 }
 
 void
@@ -72,8 +71,7 @@ icmn_err(register int level, char *fmt, va_list ap)
 		strcat(message, "\n");
 	spin_unlock_irqrestore(&xfs_err_lock,flags);
 	printk("%s%s", err_level[level], message);
-	if (level == CE_PANIC)
-		BUG();
+	BUG_ON(level == CE_PANIC);
 }
 
 void
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index addf5a7ea06c..5cf2e86caa71 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -75,7 +75,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
 							    sleep);
 	} else {
 		ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)),
-							    sleep);
+							    sleep | KM_LARGE);
 	}
 
 	if (ktep == NULL) {
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index dc2361dd740a..9ece7f87ec5b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -150,7 +150,7 @@ typedef struct xfs_agi {
 #define	XFS_BUF_TO_AGFL(bp)	((xfs_agfl_t *)XFS_BUF_PTR(bp))
 
 typedef struct xfs_agfl {
-	xfs_agblock_t	agfl_bno[1];	/* actually XFS_AGFL_SIZE(mp) */
+	__be32		agfl_bno[1];	/* actually XFS_AGFL_SIZE(mp) */
 } xfs_agfl_t;
 
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index d2bbcd882a69..e80dda3437d1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1477,8 +1477,10 @@ xfs_alloc_ag_vextent_small(
 	/*
 	 * Can't allocate from the freelist for some reason.
 	 */
-	else
+	else {
+		fbno = NULLAGBLOCK;
 		flen = 0;
+	}
 	/*
 	 * Can't do the allocation, give up.
 	 */
@@ -2021,7 +2023,7 @@ xfs_alloc_get_freelist(
 	/*
 	 * Get the block number and update the data structures.
 	 */
-	bno = INT_GET(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)], ARCH_CONVERT);
+	bno = be32_to_cpu(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
 	be32_add(&agf->agf_flfirst, 1);
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
@@ -2108,7 +2110,7 @@ xfs_alloc_put_freelist(
 {
 	xfs_agf_t		*agf;	/* a.g. freespace structure */
 	xfs_agfl_t		*agfl;	/* a.g. free block array */
-	xfs_agblock_t		*blockp;/* pointer to array entry */
+	__be32			*blockp;/* pointer to array entry */
 	int			error;
 #ifdef XFS_ALLOC_TRACE
 	static char		fname[] = "xfs_alloc_put_freelist";
@@ -2132,7 +2134,7 @@ xfs_alloc_put_freelist(
 	pag->pagf_flcount++;
 	ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
 	blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
-	INT_SET(*blockp, ARCH_CONVERT, bno);
+	*blockp = cpu_to_be32(bno);
 	TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
 	xfs_trans_log_buf(tp, agflbp,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 7446556e8021..74cadf95d4e8 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -92,6 +92,7 @@ xfs_alloc_delrec(
 	xfs_alloc_key_t		*rkp;	/* right block key pointer */
 	xfs_alloc_ptr_t		*rpp;	/* right block address pointer */
 	int			rrecs=0;	/* number of records in right block */
+	int			numrecs;
 	xfs_alloc_rec_t		*rrp;	/* right block record pointer */
 	xfs_btree_cur_t		*tcur;	/* temporary btree cursor */
 
@@ -115,7 +116,8 @@ xfs_alloc_delrec(
 	/*
 	 * Fail if we're off the end of the block.
 	 */
-	if (ptr > be16_to_cpu(block->bb_numrecs)) {
+	numrecs = be16_to_cpu(block->bb_numrecs);
+	if (ptr > numrecs) {
 		*stat = 0;
 		return 0;
 	}
@@ -129,18 +131,18 @@ xfs_alloc_delrec(
 		lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
 		lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
 #ifdef DEBUG
-		for (i = ptr; i < be16_to_cpu(block->bb_numrecs); i++) {
+		for (i = ptr; i < numrecs; i++) {
 			if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
 				return error;
 		}
 #endif
-		if (ptr < be16_to_cpu(block->bb_numrecs)) {
+		if (ptr < numrecs) {
 			memmove(&lkp[ptr - 1], &lkp[ptr],
-				(be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lkp));
+				(numrecs - ptr) * sizeof(*lkp));
 			memmove(&lpp[ptr - 1], &lpp[ptr],
-				(be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lpp));
-			xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1);
-			xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1);
+				(numrecs - ptr) * sizeof(*lpp));
+			xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1);
+			xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1);
 		}
 	}
 	/*
@@ -149,10 +151,10 @@ xfs_alloc_delrec(
 	 */
 	else {
 		lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-		if (ptr < be16_to_cpu(block->bb_numrecs)) {
+		if (ptr < numrecs) {
 			memmove(&lrp[ptr - 1], &lrp[ptr],
-				(be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lrp));
-			xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1);
+				(numrecs - ptr) * sizeof(*lrp));
+			xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1);
 		}
 		/*
 		 * If it's the first record in the block, we'll need a key
@@ -167,7 +169,8 @@ xfs_alloc_delrec(
 	/*
 	 * Decrement and log the number of entries in the block.
 	 */
-	be16_add(&block->bb_numrecs, -1);
+	numrecs--;
+	block->bb_numrecs = cpu_to_be16(numrecs);
 	xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
 	/*
 	 * See if the longest free extent in the allocation group was
@@ -181,14 +184,14 @@ xfs_alloc_delrec(
 	if (level == 0 &&
 	    cur->bc_btnum == XFS_BTNUM_CNT &&
 	    be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-	    ptr > be16_to_cpu(block->bb_numrecs)) {
-		ASSERT(ptr == be16_to_cpu(block->bb_numrecs) + 1);
+	    ptr > numrecs) {
+		ASSERT(ptr == numrecs + 1);
 		/*
 		 * There are still records in the block.  Grab the size
 		 * from the last one.
 		 */
-		if (be16_to_cpu(block->bb_numrecs)) {
-			rrp = XFS_ALLOC_REC_ADDR(block, be16_to_cpu(block->bb_numrecs), cur);
+		if (numrecs) {
+			rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur);
 			agf->agf_longest = rrp->ar_blockcount;
 		}
 		/*
@@ -211,7 +214,7 @@ xfs_alloc_delrec(
 		 * and it's NOT the leaf level,
 		 * then we can get rid of this level.
 		 */
-		if (be16_to_cpu(block->bb_numrecs) == 1 && level > 0) {
+		if (numrecs == 1 && level > 0) {
 			/*
 			 * lpp is still set to the first pointer in the block.
 			 * Make it the new root of the btree.
@@ -267,7 +270,7 @@ xfs_alloc_delrec(
 	 * If the number of records remaining in the block is at least
 	 * the minimum, we're done.
 	 */
-	if (be16_to_cpu(block->bb_numrecs) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
+	if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
 		if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
 			return error;
 		*stat = 1;
@@ -419,19 +422,21 @@ xfs_alloc_delrec(
 	 * See if we can join with the left neighbor block.
 	 */
 	if (lbno != NULLAGBLOCK &&
-	    lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+	    lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
 		/*
 		 * Set "right" to be the starting block,
 		 * "left" to be the left neighbor.
 		 */
 		rbno = bno;
 		right = block;
+		rrecs = be16_to_cpu(right->bb_numrecs);
 		rbp = bp;
 		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
 				cur->bc_private.a.agno, lbno, 0, &lbp,
 				XFS_ALLOC_BTREE_REF)))
 			return error;
 		left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
+		lrecs = be16_to_cpu(left->bb_numrecs);
 		if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
 			return error;
 	}
@@ -439,20 +444,21 @@ xfs_alloc_delrec(
 	 * If that won't work, see if we can join with the right neighbor block.
 	 */
 	else if (rbno != NULLAGBLOCK &&
-		 rrecs + be16_to_cpu(block->bb_numrecs) <=
-		  XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+		 rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
 		/*
 		 * Set "left" to be the starting block,
 		 * "right" to be the right neighbor.
 		 */
 		lbno = bno;
 		left = block;
+		lrecs = be16_to_cpu(left->bb_numrecs);
 		lbp = bp;
 		if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
 				cur->bc_private.a.agno, rbno, 0, &rbp,
 				XFS_ALLOC_BTREE_REF)))
 			return error;
 		right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
+		rrecs = be16_to_cpu(right->bb_numrecs);
 		if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
 			return error;
 	}
@@ -474,34 +480,28 @@ xfs_alloc_delrec(
 		/*
 		 * It's a non-leaf.  Move keys and pointers.
 		 */
-		lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur);
-		lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur);
+		lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur);
+		lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur);
 		rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
 		rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
 #ifdef DEBUG
-		for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
+		for (i = 0; i < rrecs; i++) {
 			if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
 				return error;
 		}
 #endif
-		memcpy(lkp, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*lkp));
-		memcpy(lpp, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*lpp));
-		xfs_alloc_log_keys(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
-				   be16_to_cpu(left->bb_numrecs) +
-				   be16_to_cpu(right->bb_numrecs));
-		xfs_alloc_log_ptrs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
-				   be16_to_cpu(left->bb_numrecs) +
-				   be16_to_cpu(right->bb_numrecs));
+		memcpy(lkp, rkp, rrecs * sizeof(*lkp));
+		memcpy(lpp, rpp, rrecs * sizeof(*lpp));
+		xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+		xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
 	} else {
 		/*
 		 * It's a leaf.  Move records.
 		 */
-		lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur);
+		lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur);
 		rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-		memcpy(lrp, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*lrp));
-		xfs_alloc_log_recs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1,
-				   be16_to_cpu(left->bb_numrecs) +
-				   be16_to_cpu(right->bb_numrecs));
+		memcpy(lrp, rrp, rrecs * sizeof(*lrp));
+		xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
 	}
 	/*
 	 * If we joined with the left neighbor, set the buffer in the
@@ -509,7 +509,7 @@ xfs_alloc_delrec(
 	 */
 	if (bp != lbp) {
 		xfs_btree_setbuf(cur, level, lbp);
-		cur->bc_ptrs[level] += be16_to_cpu(left->bb_numrecs);
+		cur->bc_ptrs[level] += lrecs;
 	}
 	/*
 	 * If we joined with the right neighbor and there's a level above
@@ -521,7 +521,8 @@ xfs_alloc_delrec(
 	/*
 	 * Fix up the number of records in the surviving block.
 	 */
-	be16_add(&left->bb_numrecs, be16_to_cpu(right->bb_numrecs));
+	lrecs += rrecs;
+	left->bb_numrecs = cpu_to_be16(lrecs);
 	/*
 	 * Fix up the right block pointer in the surviving block, and log it.
 	 */
@@ -608,6 +609,7 @@ xfs_alloc_insrec(
 	xfs_btree_cur_t		*ncur;	/* new cursor to be used at next lvl */
 	xfs_alloc_key_t		nkey;	/* new key value, from split */
 	xfs_alloc_rec_t		nrec;	/* new record value, for caller */
+	int			numrecs;
 	int			optr;	/* old ptr value */
 	xfs_alloc_ptr_t		*pp;	/* pointer to btree addresses */
 	int			ptr;	/* index in btree block for this rec */
@@ -653,13 +655,14 @@ xfs_alloc_insrec(
 	 */
 	bp = cur->bc_bufs[level];
 	block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+	numrecs = be16_to_cpu(block->bb_numrecs);
 #ifdef DEBUG
 	if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
 		return error;
 	/*
 	 * Check that the new entry is being inserted in the right place.
 	 */
-	if (ptr <= be16_to_cpu(block->bb_numrecs)) {
+	if (ptr <= numrecs) {
 		if (level == 0) {
 			rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
 			xfs_btree_check_rec(cur->bc_btnum, recp, rp);
@@ -670,12 +673,12 @@ xfs_alloc_insrec(
 	}
 #endif
 	nbno = NULLAGBLOCK;
-	ncur = (xfs_btree_cur_t *)0;
+	ncur = NULL;
 	/*
 	 * If the block is full, we can't insert the new entry until we
 	 * make the block un-full.
 	 */
-	if (be16_to_cpu(block->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+	if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
 		/*
 		 * First, try shifting an entry to the right neighbor.
 		 */
@@ -729,6 +732,7 @@ xfs_alloc_insrec(
 	 * At this point we know there's room for our new entry in the block
 	 * we're pointing at.
 	 */
+	numrecs = be16_to_cpu(block->bb_numrecs);
 	if (level > 0) {
 		/*
 		 * It's a non-leaf entry.  Make a hole for the new data
@@ -737,15 +741,15 @@ xfs_alloc_insrec(
 		kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
 		pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
 #ifdef DEBUG
-		for (i = be16_to_cpu(block->bb_numrecs); i >= ptr; i--) {
+		for (i = numrecs; i >= ptr; i--) {
 			if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
 				return error;
 		}
 #endif
 		memmove(&kp[ptr], &kp[ptr - 1],
-			(be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*kp));
+			(numrecs - ptr + 1) * sizeof(*kp));
 		memmove(&pp[ptr], &pp[ptr - 1],
-			(be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*pp));
+			(numrecs - ptr + 1) * sizeof(*pp));
 #ifdef DEBUG
 		if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
 			return error;
@@ -755,11 +759,12 @@ xfs_alloc_insrec(
 		 */
 		kp[ptr - 1] = key;
 		pp[ptr - 1] = cpu_to_be32(*bnop);
-		be16_add(&block->bb_numrecs, 1);
-		xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs));
-		xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs));
+		numrecs++;
+		block->bb_numrecs = cpu_to_be16(numrecs);
+		xfs_alloc_log_keys(cur, bp, ptr, numrecs);
+		xfs_alloc_log_ptrs(cur, bp, ptr, numrecs);
 #ifdef DEBUG
-		if (ptr < be16_to_cpu(block->bb_numrecs))
+		if (ptr < numrecs)
 			xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
 				kp + ptr);
 #endif
@@ -769,16 +774,17 @@ xfs_alloc_insrec(
 		 */
 		rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
 		memmove(&rp[ptr], &rp[ptr - 1],
-			(be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*rp));
+			(numrecs - ptr + 1) * sizeof(*rp));
 		/*
 		 * Now stuff the new record in, bump numrecs
 		 * and log the new data.
 		 */
-		rp[ptr - 1] = *recp; /* INT_: struct copy */
-		be16_add(&block->bb_numrecs, 1);
-		xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs));
+		rp[ptr - 1] = *recp;
+		numrecs++;
+		block->bb_numrecs = cpu_to_be16(numrecs);
+		xfs_alloc_log_recs(cur, bp, ptr, numrecs);
 #ifdef DEBUG
-		if (ptr < be16_to_cpu(block->bb_numrecs))
+		if (ptr < numrecs)
 			xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
 				rp + ptr);
 #endif
@@ -819,8 +825,8 @@ xfs_alloc_insrec(
 	 */
 	*bnop = nbno;
 	if (nbno != NULLAGBLOCK) {
-		*recp = nrec; /* INT_: struct copy */
-		*curp = ncur; /* INT_: struct copy */
+		*recp = nrec;
+		*curp = ncur;
 	}
 	*stat = 1;
 	return 0;
@@ -981,7 +987,7 @@ xfs_alloc_lookup(
 		 */
 		bp = cur->bc_bufs[level];
 		if (bp && XFS_BUF_ADDR(bp) != d)
-			bp = (xfs_buf_t *)0;
+			bp = NULL;
 		if (!bp) {
 			/*
 			 * Need to get a new buffer.  Read it, then
@@ -1229,7 +1235,7 @@ xfs_alloc_lshift(
 		if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
 			return error;
 #endif
-		*lpp = *rpp; /* INT_: copy */
+		*lpp = *rpp;
 		xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
 		xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
 	}
@@ -1406,8 +1412,8 @@ xfs_alloc_newroot(
 
 		kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
 		if (be16_to_cpu(left->bb_level) > 0) {
-			kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */
-			kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */
+			kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur);
+			kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);
 		} else {
 			xfs_alloc_rec_t	*rp;	/* btree record pointer */
 
@@ -1527,8 +1533,8 @@ xfs_alloc_rshift(
 		if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
 			return error;
 #endif
-		*rkp = *lkp; /* INT_: copy */
-		*rpp = *lpp; /* INT_: copy */
+		*rkp = *lkp;
+		*rpp = *lpp;
 		xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
 		xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
 		xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
@@ -2044,7 +2050,7 @@ xfs_alloc_insert(
 	nbno = NULLAGBLOCK;
 	nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
 	nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
-	ncur = (xfs_btree_cur_t *)0;
+	ncur = NULL;
 	pcur = cur;
 	/*
 	 * Loop going up the tree, starting at the leaf level.
@@ -2076,7 +2082,7 @@ xfs_alloc_insert(
 		 */
 		if (ncur) {
 			pcur = ncur;
-			ncur = (xfs_btree_cur_t *)0;
+			ncur = NULL;
 		}
 	} while (nbno != NULLAGBLOCK);
 	*stat = i;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1a2101043275..9ada7bdbae52 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -91,7 +91,6 @@ STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
 /*
  * Routines to manipulate out-of-line attribute values.
  */
-STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args);
 STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args);
 STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args);
 
@@ -180,7 +179,7 @@ xfs_attr_get(bhv_desc_t *bdp, const char *name, char *value, int *valuelenp,
 	return(error);
 }
 
-STATIC int
+int
 xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen,
 		 char *value, int valuelen, int flags)
 {
@@ -440,7 +439,7 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f
  * Generic handler routine to remove a name from an attribute list.
  * Transitions attribute list from Btree to shortform as necessary.
  */
-STATIC int
+int
 xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags)
 {
 	xfs_da_args_t	args;
@@ -591,6 +590,110 @@ xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred)
 	return xfs_attr_remove_int(dp, name, namelen, flags);
 }
 
+int								/* error */
+xfs_attr_list_int(xfs_attr_list_context_t *context)
+{
+	int error;
+	xfs_inode_t *dp = context->dp;
+
+	/*
+	 * Decide on what work routines to call based on the inode size.
+	 */
+	if (XFS_IFORK_Q(dp) == 0 ||
+	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+	     dp->i_d.di_anextents == 0)) {
+		error = 0;
+	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_attr_shortform_list(context);
+	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+		error = xfs_attr_leaf_list(context);
+	} else {
+		error = xfs_attr_node_list(context);
+	}
+	return error;
+}
+
+#define	ATTR_ENTBASESIZE		/* minimum bytes used by an attr */ \
+	(((struct attrlist_ent *) 0)->a_name - (char *) 0)
+#define	ATTR_ENTSIZE(namelen)		/* actual bytes used by an attr */ \
+	((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
+	 & ~(sizeof(u_int32_t)-1))
+
+/*
+ * Format an attribute and copy it out to the user's buffer.
+ * Take care to check values and protect against them changing later,
+ * we may be reading them directly out of a user buffer.
+ */
+/*ARGSUSED*/
+STATIC int
+xfs_attr_put_listent(xfs_attr_list_context_t *context, attrnames_t *namesp,
+		     char *name, int namelen,
+		     int valuelen, char *value)
+{
+	attrlist_ent_t *aep;
+	int arraytop;
+
+	ASSERT(!(context->flags & ATTR_KERNOVAL));
+	ASSERT(context->count >= 0);
+	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
+	ASSERT(context->firstu >= sizeof(*context->alist));
+	ASSERT(context->firstu <= context->bufsize);
+
+	arraytop = sizeof(*context->alist) +
+			context->count * sizeof(context->alist->al_offset[0]);
+	context->firstu -= ATTR_ENTSIZE(namelen);
+	if (context->firstu < arraytop) {
+		xfs_attr_trace_l_c("buffer full", context);
+		context->alist->al_more = 1;
+		context->seen_enough = 1;
+		return 1;
+	}
+
+	aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
+	aep->a_valuelen = valuelen;
+	memcpy(aep->a_name, name, namelen);
+	aep->a_name[ namelen ] = 0;
+	context->alist->al_offset[ context->count++ ] = context->firstu;
+	context->alist->al_count = context->count;
+	xfs_attr_trace_l_c("add", context);
+	return 0;
+}
+
+STATIC int
+xfs_attr_kern_list(xfs_attr_list_context_t *context, attrnames_t *namesp,
+		     char *name, int namelen,
+		     int valuelen, char *value)
+{
+	char *offset;
+	int arraytop;
+
+	ASSERT(context->count >= 0);
+
+	arraytop = context->count + namesp->attr_namelen + namelen + 1;
+	if (arraytop > context->firstu) {
+		context->count = -1;	/* insufficient space */
+		return 1;
+	}
+	offset = (char *)context->alist + context->count;
+	strncpy(offset, namesp->attr_name, namesp->attr_namelen);
+	offset += namesp->attr_namelen;
+	strncpy(offset, name, namelen);			/* real name */
+	offset += namelen;
+	*offset = '\0';
+	context->count += namesp->attr_namelen + namelen + 1;
+	return 0;
+}
+
+/*ARGSUSED*/
+STATIC int
+xfs_attr_kern_list_sizes(xfs_attr_list_context_t *context, attrnames_t *namesp,
+		     char *name, int namelen,
+		     int valuelen, char *value)
+{
+	context->count += namesp->attr_namelen + namelen + 1;
+	return 0;
+}
+
 /*
  * Generate a list of extended attribute names and optionally
  * also value lengths.  Positive return value follows the XFS
@@ -615,13 +718,13 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
 		return(XFS_ERROR(EINVAL));
 	if ((cursor->initted == 0) &&
 	    (cursor->hashval || cursor->blkno || cursor->offset))
-		return(XFS_ERROR(EINVAL));
+		return XFS_ERROR(EINVAL);
 
 	/*
 	 * Check for a properly aligned buffer.
 	 */
 	if (((long)buffer) & (sizeof(int)-1))
-		return(XFS_ERROR(EFAULT));
+		return XFS_ERROR(EFAULT);
 	if (flags & ATTR_KERNOVAL)
 		bufsize = 0;
 
@@ -634,53 +737,47 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags,
 	context.dupcnt = 0;
 	context.resynch = 1;
 	context.flags = flags;
-	if (!(flags & ATTR_KERNAMELS)) {
+	context.seen_enough = 0;
+	context.alist = (attrlist_t *)buffer;
+	context.put_value = 0;
+
+	if (flags & ATTR_KERNAMELS) {
+		context.bufsize = bufsize;
+		context.firstu = context.bufsize;
+		if (flags & ATTR_KERNOVAL)
+			context.put_listent = xfs_attr_kern_list_sizes;
+		else
+			context.put_listent = xfs_attr_kern_list;
+	} else {
 		context.bufsize = (bufsize & ~(sizeof(int)-1));  /* align */
 		context.firstu = context.bufsize;
-		context.alist = (attrlist_t *)buffer;
 		context.alist->al_count = 0;
 		context.alist->al_more = 0;
 		context.alist->al_offset[0] = context.bufsize;
-	}
-	else {
-		context.bufsize = bufsize;
-		context.firstu = context.bufsize;
-		context.alist = (attrlist_t *)buffer;
+		context.put_listent = xfs_attr_put_listent;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return (EIO);
+		return EIO;
 
 	xfs_ilock(dp, XFS_ILOCK_SHARED);
-	/*
-	 * Decide on what work routines to call based on the inode size.
-	 */
 	xfs_attr_trace_l_c("syscall start", &context);
-	if (XFS_IFORK_Q(dp) == 0 ||
-	    (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-	     dp->i_d.di_anextents == 0)) {
-		error = 0;
-	} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-		error = xfs_attr_shortform_list(&context);
-	} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-		error = xfs_attr_leaf_list(&context);
-	} else {
-		error = xfs_attr_node_list(&context);
-	}
+
+	error = xfs_attr_list_int(&context);
+
 	xfs_iunlock(dp, XFS_ILOCK_SHARED);
 	xfs_attr_trace_l_c("syscall end", &context);
 
-	if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) {
-		ASSERT(error >= 0);
-	}
-	else {	/* must return negated buffer size or the error */
+	if (context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS)) {
+		/* must return negated buffer size or the error */
 		if (context.count < 0)
 			error = XFS_ERROR(ERANGE);
 		else
 			error = -context.count;
-	}
+	} else
+		ASSERT(error >= 0);
 
-	return(error);
+	return error;
 }
 
 int								/* error */
@@ -1122,19 +1219,19 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
 	context->cursor->blkno = 0;
 	error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
 	if (error)
-		return(error);
+		return XFS_ERROR(error);
 	ASSERT(bp != NULL);
 	leaf = bp->data;
 	if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) {
 		XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
 				     context->dp->i_mount, leaf);
 		xfs_da_brelse(NULL, bp);
-		return(XFS_ERROR(EFSCORRUPTED));
+		return XFS_ERROR(EFSCORRUPTED);
 	}
 
-	(void)xfs_attr_leaf_list_int(bp, context);
+	error = xfs_attr_leaf_list_int(bp, context);
 	xfs_da_brelse(NULL, bp);
-	return(0);
+	return XFS_ERROR(error);
 }
 
 
@@ -1858,8 +1955,12 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 			return(XFS_ERROR(EFSCORRUPTED));
 		}
 		error = xfs_attr_leaf_list_int(bp, context);
-		if (error || !leaf->hdr.info.forw)
-			break;	/* not really an error, buffer full or EOF */
+		if (error) {
+			xfs_da_brelse(NULL, bp);
+			return error;
+		}
+		if (context->seen_enough || leaf->hdr.info.forw == 0)
+			break;
 		cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
 		xfs_da_brelse(NULL, bp);
 		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
@@ -1886,7 +1987,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
  * Read the value associated with an attribute from the out-of-line buffer
  * that we stored it in.
  */
-STATIC int
+int
 xfs_attr_rmtval_get(xfs_da_args_t *args)
 {
 	xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE];
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 981633f6c077..783977d3ea71 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -37,6 +37,7 @@
 
 struct cred;
 struct bhv_vnode;
+struct xfs_attr_list_context;
 
 typedef int (*attrset_t)(struct bhv_vnode *, char *, void *, size_t, int);
 typedef int (*attrget_t)(struct bhv_vnode *, char *, void *, size_t, int);
@@ -160,13 +161,16 @@ struct xfs_da_args;
  */
 int xfs_attr_get(bhv_desc_t *, const char *, char *, int *, int, struct cred *);
 int xfs_attr_set(bhv_desc_t *, const char *, char *, int, int, struct cred *);
+int xfs_attr_set_int(struct xfs_inode *, const char *, int, char *, int, int);
 int xfs_attr_remove(bhv_desc_t *, const char *, int, struct cred *);
-int xfs_attr_list(bhv_desc_t *, char *, int, int,
-			 struct attrlist_cursor_kern *, struct cred *);
+int xfs_attr_remove_int(struct xfs_inode *, const char *, int, int);
+int xfs_attr_list(bhv_desc_t *, char *, int, int, struct attrlist_cursor_kern *, struct cred *);
+int xfs_attr_list_int(struct xfs_attr_list_context *);
 int xfs_attr_inactive(struct xfs_inode *dp);
 
 int xfs_attr_shortform_getvalue(struct xfs_da_args *);
 int xfs_attr_fetch(struct xfs_inode *, const char *, int,
 			char *, int *, int, struct cred *);
+int xfs_attr_rmtval_get(struct xfs_da_args *args);
 
 #endif	/* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 9455051f0120..9719bbef122c 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -89,9 +89,46 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
 					 int dst_start, int move_count,
 					 xfs_mount_t *mp);
 STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
-STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context,
-			     attrnames_t *, char *name, int namelen,
-			     int valuelen);
+
+/*========================================================================
+ * Namespace helper routines
+ *========================================================================*/
+
+STATIC inline attrnames_t *
+xfs_attr_flags_namesp(int flags)
+{
+	return ((flags & XFS_ATTR_SECURE) ? &attr_secure:
+		  ((flags & XFS_ATTR_ROOT) ? &attr_trusted : &attr_user));
+}
+
+/*
+ * If namespace bits don't match return 0.
+ * If all match then return 1.
+ */
+STATIC inline int
+xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
+{
+	return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
+}
+
+/*
+ * If namespace bits don't match and we don't have an override for it
+ * then return 0.
+ * If all match or are overridable then return 1.
+ */
+STATIC inline int
+xfs_attr_namesp_match_overrides(int arg_flags, int ondisk_flags)
+{
+	if (((arg_flags & ATTR_SECURE) == 0) !=
+	    ((ondisk_flags & XFS_ATTR_SECURE) == 0) &&
+	    !(arg_flags & ATTR_KERNORMALS))
+		return 0;
+	if (((arg_flags & ATTR_ROOT) == 0) !=
+	    ((ondisk_flags & XFS_ATTR_ROOT) == 0) &&
+	    !(arg_flags & ATTR_KERNROOTLS))
+		return 0;
+	return 1;
+}
 
 
 /*========================================================================
@@ -228,11 +265,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
 			continue;
 		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
 			continue;
-		if (((args->flags & ATTR_SECURE) != 0) !=
-		    ((sfe->flags & XFS_ATTR_SECURE) != 0))
-			continue;
-		if (((args->flags & ATTR_ROOT) != 0) !=
-		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
 			continue;
 		ASSERT(0);
 #endif
@@ -246,8 +279,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
 
 	sfe->namelen = args->namelen;
 	sfe->valuelen = args->valuelen;
-	sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE :
-			((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
+	sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
 	memcpy(sfe->nameval, args->name, args->namelen);
 	memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
 	sf->hdr.count++;
@@ -282,11 +314,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
 			continue;
 		if (memcmp(sfe->nameval, args->name, args->namelen) != 0)
 			continue;
-		if (((args->flags & ATTR_SECURE) != 0) !=
-		    ((sfe->flags & XFS_ATTR_SECURE) != 0))
-			continue;
-		if (((args->flags & ATTR_ROOT) != 0) !=
-		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
 			continue;
 		break;
 	}
@@ -363,11 +391,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
 			continue;
 		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
 			continue;
-		if (((args->flags & ATTR_SECURE) != 0) !=
-		    ((sfe->flags & XFS_ATTR_SECURE) != 0))
-			continue;
-		if (((args->flags & ATTR_ROOT) != 0) !=
-		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
 			continue;
 		return(XFS_ERROR(EEXIST));
 	}
@@ -394,11 +418,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
 			continue;
 		if (memcmp(args->name, sfe->nameval, args->namelen) != 0)
 			continue;
-		if (((args->flags & ATTR_SECURE) != 0) !=
-		    ((sfe->flags & XFS_ATTR_SECURE) != 0))
-			continue;
-		if (((args->flags & ATTR_ROOT) != 0) !=
-		    ((sfe->flags & XFS_ATTR_ROOT) != 0))
+		if (!xfs_attr_namesp_match(args->flags, sfe->flags))
 			continue;
 		if (args->flags & ATTR_KERNOVAL) {
 			args->valuelen = sfe->valuelen;
@@ -485,8 +505,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
 		nargs.valuelen = sfe->valuelen;
 		nargs.hashval = xfs_da_hashname((char *)sfe->nameval,
 						sfe->namelen);
-		nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE :
-				((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
+		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
 		error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */
 		ASSERT(error == ENOATTR);
 		error = xfs_attr_leaf_add(bp, &nargs);
@@ -520,6 +539,10 @@ xfs_attr_shortform_compare(const void *a, const void *b)
 	}
 }
 
+
+#define XFS_ISRESET_CURSOR(cursor) \
+	(!((cursor)->initted) && !((cursor)->hashval) && \
+	 !((cursor)->blkno) && !((cursor)->offset))
 /*
  * Copy out entries of shortform attribute lists for attr_list().
  * Shortform attribute lists are not stored in hashval sorted order.
@@ -537,6 +560,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	xfs_attr_sf_entry_t *sfe;
 	xfs_inode_t *dp;
 	int sbsize, nsbuf, count, i;
+	int error;
 
 	ASSERT(context != NULL);
 	dp = context->dp;
@@ -552,46 +576,51 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	xfs_attr_trace_l_c("sf start", context);
 
 	/*
-	 * If the buffer is large enough, do not bother with sorting.
+	 * If the buffer is large enough and the cursor is at the start,
+	 * do not bother with sorting since we will return everything in
+	 * one buffer and another call using the cursor won't need to be
+	 * made.
 	 * Note the generous fudge factor of 16 overhead bytes per entry.
+	 * If bufsize is zero then put_listent must be a search function
+	 * and can just scan through what we have.
 	 */
-	if ((dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize) {
+	if (context->bufsize == 0 ||
+	    (XFS_ISRESET_CURSOR(cursor) &&
+             (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
 		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 			attrnames_t	*namesp;
 
-			if (((context->flags & ATTR_SECURE) != 0) !=
-			    ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
-			    !(context->flags & ATTR_KERNORMALS)) {
-				sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-				continue;
-			}
-			if (((context->flags & ATTR_ROOT) != 0) !=
-			    ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
-			    !(context->flags & ATTR_KERNROOTLS)) {
+			if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) {
 				sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 				continue;
 			}
-			namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure:
-				((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted :
-				  &attr_user);
-			if (context->flags & ATTR_KERNOVAL) {
-				ASSERT(context->flags & ATTR_KERNAMELS);
-				context->count += namesp->attr_namelen +
-					sfe->namelen + 1;
-			}
-			else {
-				if (xfs_attr_put_listent(context, namesp,
-						   (char *)sfe->nameval,
-						   (int)sfe->namelen,
-						   (int)sfe->valuelen))
-					break;
-			}
+			namesp = xfs_attr_flags_namesp(sfe->flags);
+			error = context->put_listent(context,
+					   namesp,
+					   (char *)sfe->nameval,
+					   (int)sfe->namelen,
+					   (int)sfe->valuelen,
+					   (char*)&sfe->nameval[sfe->namelen]);
+
+			/*
+			 * Either search callback finished early or
+			 * didn't fit it all in the buffer after all.
+			 */
+			if (context->seen_enough)
+				break;
+
+			if (error)
+				return error;
 			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 		}
 		xfs_attr_trace_l_c("sf big-gulp", context);
 		return(0);
 	}
 
+	/* do no more for a search callback */
+	if (context->bufsize == 0)
+		return 0;
+
 	/*
 	 * It didn't all fit, so we have to sort everything on hashval.
 	 */
@@ -614,15 +643,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 			kmem_free(sbuf, sbsize);
 			return XFS_ERROR(EFSCORRUPTED);
 		}
-		if (((context->flags & ATTR_SECURE) != 0) !=
-		    ((sfe->flags & XFS_ATTR_SECURE) != 0) &&
-		    !(context->flags & ATTR_KERNORMALS)) {
-			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
-			continue;
-		}
-		if (((context->flags & ATTR_ROOT) != 0) !=
-		    ((sfe->flags & XFS_ATTR_ROOT) != 0) &&
-		    !(context->flags & ATTR_KERNROOTLS)) {
+		if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) {
 			sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
 			continue;
 		}
@@ -671,24 +692,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
 	for ( ; i < nsbuf; i++, sbp++) {
 		attrnames_t	*namesp;
 
-		namesp = (sbp->flags & XFS_ATTR_SECURE) ? &attr_secure :
-			((sbp->flags & XFS_ATTR_ROOT) ? &attr_trusted :
-			  &attr_user);
+		namesp = xfs_attr_flags_namesp(sbp->flags);
 
 		if (cursor->hashval != sbp->hash) {
 			cursor->hashval = sbp->hash;
 			cursor->offset = 0;
 		}
-		if (context->flags & ATTR_KERNOVAL) {
-			ASSERT(context->flags & ATTR_KERNAMELS);
-			context->count += namesp->attr_namelen +
-						sbp->namelen + 1;
-		} else {
-			if (xfs_attr_put_listent(context, namesp,
-					sbp->name, sbp->namelen,
-					sbp->valuelen))
-				break;
-		}
+		error = context->put_listent(context,
+					namesp,
+					sbp->name,
+					sbp->namelen,
+					sbp->valuelen,
+					&sbp->name[sbp->namelen]);
+		if (error)
+			return error;
+		if (context->seen_enough)
+			break;
 		cursor->offset++;
 	}
 
@@ -810,8 +829,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
 		nargs.valuelen = be16_to_cpu(name_loc->valuelen);
 		nargs.hashval = be32_to_cpu(entry->hashval);
-		nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE :
-			      ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0);
+		nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags);
 		xfs_attr_shortform_add(&nargs, forkoff);
 	}
 	error = 0;
@@ -1098,8 +1116,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
 				     be16_to_cpu(map->size));
 	entry->hashval = cpu_to_be32(args->hashval);
 	entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
-	entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE :
-			((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0);
+	entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
 	if (args->rename) {
 		entry->flags |= XFS_ATTR_INCOMPLETE;
 		if ((args->blkno2 == args->blkno) &&
@@ -1926,7 +1943,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
 		else
 			break;
 	}
-	ASSERT((probe >= 0) && 
+	ASSERT((probe >= 0) &&
 	       (!leaf->hdr.count
 	       || (probe < be16_to_cpu(leaf->hdr.count))));
 	ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval));
@@ -1971,14 +1988,9 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
 			name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
 			if (name_loc->namelen != args->namelen)
 				continue;
-			if (memcmp(args->name, (char *)name_loc->nameval,
-					     args->namelen) != 0)
+			if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
 				continue;
-			if (((args->flags & ATTR_SECURE) != 0) !=
-			    ((entry->flags & XFS_ATTR_SECURE) != 0))
-				continue;
-			if (((args->flags & ATTR_ROOT) != 0) !=
-			    ((entry->flags & XFS_ATTR_ROOT) != 0))
+			if (!xfs_attr_namesp_match(args->flags, entry->flags))
 				continue;
 			args->index = probe;
 			return(XFS_ERROR(EEXIST));
@@ -1989,11 +2001,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
 			if (memcmp(args->name, (char *)name_rmt->name,
 					     args->namelen) != 0)
 				continue;
-			if (((args->flags & ATTR_SECURE) != 0) !=
-			    ((entry->flags & XFS_ATTR_SECURE) != 0))
-				continue;
-			if (((args->flags & ATTR_ROOT) != 0) !=
-			    ((entry->flags & XFS_ATTR_ROOT) != 0))
+			if (!xfs_attr_namesp_match(args->flags, entry->flags))
 				continue;
 			args->index = probe;
 			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
@@ -2312,8 +2320,6 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 	attrlist_cursor_kern_t *cursor;
 	xfs_attr_leafblock_t *leaf;
 	xfs_attr_leaf_entry_t *entry;
-	xfs_attr_leaf_name_local_t *name_loc;
-	xfs_attr_leaf_name_remote_t *name_rmt;
 	int retval, i;
 
 	ASSERT(bp != NULL);
@@ -2355,9 +2361,8 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 	 * We have found our place, start copying out the new attributes.
 	 */
 	retval = 0;
-	for (  ; (i < be16_to_cpu(leaf->hdr.count))
-	     && (retval == 0); entry++, i++) {
-		attrnames_t	*namesp;
+	for (  ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) {
+		attrnames_t *namesp;
 
 		if (be32_to_cpu(entry->hashval) != cursor->hashval) {
 			cursor->hashval = be32_to_cpu(entry->hashval);
@@ -2366,115 +2371,69 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 
 		if (entry->flags & XFS_ATTR_INCOMPLETE)
 			continue;		/* skip incomplete entries */
-		if (((context->flags & ATTR_SECURE) != 0) !=
-		    ((entry->flags & XFS_ATTR_SECURE) != 0) &&
-		    !(context->flags & ATTR_KERNORMALS))
-			continue;		/* skip non-matching entries */
-		if (((context->flags & ATTR_ROOT) != 0) !=
-		    ((entry->flags & XFS_ATTR_ROOT) != 0) &&
-		    !(context->flags & ATTR_KERNROOTLS))
-			continue;		/* skip non-matching entries */
-
-		namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure :
-			((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted :
-			  &attr_user);
+		if (!xfs_attr_namesp_match_overrides(context->flags, entry->flags))
+			continue;
+
+		namesp = xfs_attr_flags_namesp(entry->flags);
 
 		if (entry->flags & XFS_ATTR_LOCAL) {
-			name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
-			if (context->flags & ATTR_KERNOVAL) {
-				ASSERT(context->flags & ATTR_KERNAMELS);
-				context->count += namesp->attr_namelen +
-						(int)name_loc->namelen + 1;
-			} else {
-				retval = xfs_attr_put_listent(context, namesp,
-					(char *)name_loc->nameval,
-					(int)name_loc->namelen,
-					be16_to_cpu(name_loc->valuelen));
-			}
+			xfs_attr_leaf_name_local_t *name_loc =
+				XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+
+			retval = context->put_listent(context,
+						namesp,
+						(char *)name_loc->nameval,
+						(int)name_loc->namelen,
+						be16_to_cpu(name_loc->valuelen),
+						(char *)&name_loc->nameval[name_loc->namelen]);
+			if (retval)
+				return retval;
 		} else {
-			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
-			if (context->flags & ATTR_KERNOVAL) {
-				ASSERT(context->flags & ATTR_KERNAMELS);
-				context->count += namesp->attr_namelen +
-						(int)name_rmt->namelen + 1;
-			} else {
-				retval = xfs_attr_put_listent(context, namesp,
-					(char *)name_rmt->name,
-					(int)name_rmt->namelen,
-					be32_to_cpu(name_rmt->valuelen));
+			xfs_attr_leaf_name_remote_t *name_rmt =
+				XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+
+			int valuelen = be32_to_cpu(name_rmt->valuelen);
+
+			if (context->put_value) {
+				xfs_da_args_t args;
+
+				memset((char *)&args, 0, sizeof(args));
+				args.dp = context->dp;
+				args.whichfork = XFS_ATTR_FORK;
+				args.valuelen = valuelen;
+				args.value = kmem_alloc(valuelen, KM_SLEEP);
+				args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
+				args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
+				retval = xfs_attr_rmtval_get(&args);
+				if (retval)
+					return retval;
+				retval = context->put_listent(context,
+						namesp,
+						(char *)name_rmt->name,
+						(int)name_rmt->namelen,
+						valuelen,
+						(char*)args.value);
+				kmem_free(args.value, valuelen);
 			}
+			else {
+				retval = context->put_listent(context,
+						namesp,
+						(char *)name_rmt->name,
+						(int)name_rmt->namelen,
+						valuelen,
+						NULL);
+			}
+			if (retval)
+				return retval;
 		}
-		if (retval == 0) {
-			cursor->offset++;
-		}
+		if (context->seen_enough)
+			break;
+		cursor->offset++;
 	}
 	xfs_attr_trace_l_cl("blk end", context, leaf);
 	return(retval);
 }
 
-#define	ATTR_ENTBASESIZE		/* minimum bytes used by an attr */ \
-	(((struct attrlist_ent *) 0)->a_name - (char *) 0)
-#define	ATTR_ENTSIZE(namelen)		/* actual bytes used by an attr */ \
-	((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
-	 & ~(sizeof(u_int32_t)-1))
-
-/*
- * Format an attribute and copy it out to the user's buffer.
- * Take care to check values and protect against them changing later,
- * we may be reading them directly out of a user buffer.
- */
-/*ARGSUSED*/
-STATIC int
-xfs_attr_put_listent(xfs_attr_list_context_t *context,
-		     attrnames_t *namesp, char *name, int namelen, int valuelen)
-{
-	attrlist_ent_t *aep;
-	int arraytop;
-
-	ASSERT(!(context->flags & ATTR_KERNOVAL));
-	if (context->flags & ATTR_KERNAMELS) {
-		char *offset;
-
-		ASSERT(context->count >= 0);
-
-		arraytop = context->count + namesp->attr_namelen + namelen + 1;
-		if (arraytop > context->firstu) {
-			context->count = -1;	/* insufficient space */
-			return(1);
-		}
-		offset = (char *)context->alist + context->count;
-		strncpy(offset, namesp->attr_name, namesp->attr_namelen);
-		offset += namesp->attr_namelen;
-		strncpy(offset, name, namelen);			/* real name */
-		offset += namelen;
-		*offset = '\0';
-		context->count += namesp->attr_namelen + namelen + 1;
-		return(0);
-	}
-
-	ASSERT(context->count >= 0);
-	ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
-	ASSERT(context->firstu >= sizeof(*context->alist));
-	ASSERT(context->firstu <= context->bufsize);
-
-	arraytop = sizeof(*context->alist) +
-			context->count * sizeof(context->alist->al_offset[0]);
-	context->firstu -= ATTR_ENTSIZE(namelen);
-	if (context->firstu < arraytop) {
-		xfs_attr_trace_l_c("buffer full", context);
-		context->alist->al_more = 1;
-		return(1);
-	}
-
-	aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
-	aep->a_valuelen = valuelen;
-	memcpy(aep->a_name, name, namelen);
-	aep->a_name[ namelen ] = 0;
-	context->alist->al_offset[ context->count++ ] = context->firstu;
-	context->alist->al_count = context->count;
-	xfs_attr_trace_l_c("add", context);
-	return(0);
-}
 
 /*========================================================================
  * Manage the INCOMPLETE flag in a leaf entry
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 51c3ee156b2f..040f732ce1e2 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -130,6 +130,19 @@ typedef struct xfs_attr_leafblock {
 #define XFS_ATTR_INCOMPLETE	(1 << XFS_ATTR_INCOMPLETE_BIT)
 
 /*
+ * Conversion macros for converting namespace bits from argument flags
+ * to ondisk flags.
+ */
+#define XFS_ATTR_NSP_ARGS_MASK		(ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK	(XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK(flags)	((flags) & XFS_ATTR_NSP_ONDISK_MASK)
+#define XFS_ATTR_NSP_ARGS(flags)	((flags) & XFS_ATTR_NSP_ARGS_MASK)
+#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x)	(((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
+					 ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
+#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x)	(((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
+					 ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
+
+/*
  * Alignment for namelist and valuelist entries (since they are mixed
  * there can be only one alignment value)
  */
@@ -196,16 +209,26 @@ static inline int xfs_attr_leaf_entsize_local_max(int bsize)
  * Structure used to pass context around among the routines.
  *========================================================================*/
 
+
+struct xfs_attr_list_context;
+
+typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, struct attrnames *,
+				      char *, int, int, char *);
+
 typedef struct xfs_attr_list_context {
-	struct xfs_inode		*dp;	/* inode */
-	struct attrlist_cursor_kern	*cursor;/* position in list */
-	struct attrlist			*alist;	/* output buffer */
-	int				count;	/* num used entries */
-	int				dupcnt;	/* count dup hashvals seen */
-	int				bufsize;/* total buffer size */
-	int				firstu;	/* first used byte in buffer */
-	int				flags;	/* from VOP call */
-	int				resynch;/* T/F: resynch with cursor */
+	struct xfs_inode		*dp;		/* inode */
+	struct attrlist_cursor_kern	*cursor;	/* position in list */
+	struct attrlist			*alist;		/* output buffer */
+	int				seen_enough;	/* T/F: seen enough of list? */
+	int				count;		/* num used entries */
+	int				dupcnt;		/* count dup hashvals seen */
+	int				bufsize;	/* total buffer size */
+	int				firstu;		/* first used byte in buffer */
+	int				flags;		/* from VOP call */
+	int				resynch;	/* T/F: resynch with cursor */
+	int				put_value;	/* T/F: need value for listent */
+	put_listent_func_t		put_listent;	/* list output fmt function */
+	int				index;		/* index into output buffer */
 } xfs_attr_list_context_t;
 
 /*
diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c
index f4fe3715a803..0dc17219d412 100644
--- a/fs/xfs/xfs_behavior.c
+++ b/fs/xfs/xfs_behavior.c
@@ -110,26 +110,6 @@ bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp)
 }
 
 /*
- * Look for a specific ops vector on the specified behavior chain.
- * Return the associated behavior descriptor.  Or NULL, if not found.
- */
-bhv_desc_t *
-bhv_lookup(bhv_head_t *bhp, void *ops)
-{
-	bhv_desc_t	*curdesc;
-
-	for (curdesc = bhp->bh_first;
-	     curdesc != NULL;
-	     curdesc = curdesc->bd_next) {
-
-		if (curdesc->bd_ops == ops)
-			return curdesc;
-	}
-
-	return NULL;
-}
-
-/*
  * Looks for the first behavior within a specified range of positions.
  * Return the associated behavior descriptor.  Or NULL, if none found.
  */
diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h
index 6e6e56fb352d..e7ca1fed955a 100644
--- a/fs/xfs/xfs_behavior.h
+++ b/fs/xfs/xfs_behavior.h
@@ -176,12 +176,10 @@ extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *);
  * Behavior module prototypes.
  */
 extern void		bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp);
-extern bhv_desc_t *	bhv_lookup(bhv_head_t *bhp, void *ops);
 extern bhv_desc_t *	bhv_lookup_range(bhv_head_t *bhp, int low, int high);
 extern bhv_desc_t *	bhv_base(bhv_head_t *bhp);
 
 /* No bhv locking on Linux */
-#define bhv_lookup_unlocked	bhv_lookup
 #define bhv_base_unlocked	bhv_base
 
 #endif /* __XFS_BEHAVIOR_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index bf46fae303af..5b050c06795f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2999,7 +2999,7 @@ xfs_bmap_btree_to_extents(
 	int			error;	/* error return value */
 	xfs_ifork_t		*ifp;	/* inode fork data */
 	xfs_mount_t		*mp;	/* mount point structure */
-	xfs_bmbt_ptr_t		*pp;	/* ptr to block address */
+	__be64			*pp;	/* ptr to block address */
 	xfs_bmbt_block_t	*rblock;/* root btree block */
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -3011,12 +3011,12 @@ xfs_bmap_btree_to_extents(
 	ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
 	mp = ip->i_mount;
 	pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
+	cbno = be64_to_cpu(*pp);
 	*logflagsp = 0;
 #ifdef DEBUG
-	if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1)))
+	if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
 		return error;
 #endif
-	cbno = INT_GET(*pp, ARCH_CONVERT);
 	if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
 			XFS_BMAP_BTREE_REF)))
 		return error;
@@ -3512,9 +3512,9 @@ xfs_bmap_extents_to_btree(
 	 */
 	kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
 	arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
-	INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(arp));
+	kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
 	pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-	INT_SET(*pp, ARCH_CONVERT, args.fsbno);
+	*pp = cpu_to_be64(args.fsbno);
 	/*
 	 * Do all this logging at the end so that
 	 * the root is at the right level.
@@ -3705,7 +3705,7 @@ STATIC xfs_bmbt_rec_t *                 /* pointer to found extent entry */
 xfs_bmap_search_extents(
 	xfs_inode_t     *ip,            /* incore inode pointer */
 	xfs_fileoff_t   bno,            /* block number searched for */
-	int             whichfork,      /* data or attr fork */
+	int             fork,      	/* data or attr fork */
 	int             *eofp,          /* out: end of file found */
 	xfs_extnum_t    *lastxp,        /* out: last extent index */
 	xfs_bmbt_irec_t *gotp,          /* out: extent entry found */
@@ -3713,25 +3713,28 @@ xfs_bmap_search_extents(
 {
 	xfs_ifork_t	*ifp;		/* inode fork pointer */
 	xfs_bmbt_rec_t  *ep;            /* extent record pointer */
-	int		rt;		/* realtime flag    */
 
 	XFS_STATS_INC(xs_look_exlist);
-	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ifp = XFS_IFORK_PTR(ip, fork);
 
 	ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
 
-	rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
-	if (unlikely(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM))) {
-                cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld "
-			"start_block : %llx start_off : %llx blkcnt : %llx "
-			"extent-state : %x \n",
-			(ip->i_mount)->m_fsname, (long long)ip->i_ino,
+	if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
+		     !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
+		xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount,
+				"Access to block zero in inode %llu "
+				"start_block: %llx start_off: %llx "
+				"blkcnt: %llx extent-state: %x lastx: %x\n",
+			(unsigned long long)ip->i_ino,
 			(unsigned long long)gotp->br_startblock,
 			(unsigned long long)gotp->br_startoff,
 			(unsigned long long)gotp->br_blockcount,
-			gotp->br_state);
-        }
-        return ep;
+			gotp->br_state, *lastxp);
+		*lastxp = NULLEXTNUM;
+		*eofp = 1;
+		return NULL;
+	}
+	return ep;
 }
 
 
@@ -4494,7 +4497,7 @@ xfs_bmap_read_extents(
 	xfs_ifork_t		*ifp;	/* fork structure */
 	int			level;	/* btree level, for checking */
 	xfs_mount_t		*mp;	/* file system mount structure */
-	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+	__be64			*pp;	/* pointer to block address */
 	/* REFERENCED */
 	xfs_extnum_t		room;	/* number of entries there's room for */
 
@@ -4510,10 +4513,10 @@ xfs_bmap_read_extents(
 	level = be16_to_cpu(block->bb_level);
 	ASSERT(level > 0);
 	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
-	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
-	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
-	bno = INT_GET(*pp, ARCH_CONVERT);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
 	/*
 	 * Go down the tree until leaf level is reached, following the first
 	 * pointer (leftmost) at each level.
@@ -4530,10 +4533,8 @@ xfs_bmap_read_extents(
 			break;
 		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
 			1, mp->m_bmap_dmxr[1]);
-		XFS_WANT_CORRUPTED_GOTO(
-			XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)),
-			error0);
-		bno = INT_GET(*pp, ARCH_CONVERT);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
 		xfs_trans_brelse(tp, bp);
 	}
 	/*
@@ -6141,7 +6142,7 @@ xfs_check_block(
 	short			sz)
 {
 	int			i, j, dmxr;
-	xfs_bmbt_ptr_t		*pp, *thispa;	/* pointer to block address */
+	__be64			*pp, *thispa;	/* pointer to block address */
 	xfs_bmbt_key_t		*prevp, *keyp;
 
 	ASSERT(be16_to_cpu(block->bb_level) > 0);
@@ -6179,11 +6180,10 @@ xfs_check_block(
 				thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
 					xfs_bmbt, block, j, dmxr);
 			}
-			if (INT_GET(*thispa, ARCH_CONVERT) ==
-			    INT_GET(*pp, ARCH_CONVERT)) {
+			if (*thispa == *pp) {
 				cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
 					__FUNCTION__, j, i,
-					INT_GET(*thispa, ARCH_CONVERT));
+					(unsigned long long)be64_to_cpu(*thispa));
 				panic("%s: ptrs are equal in node\n",
 					__FUNCTION__);
 			}
@@ -6210,7 +6210,7 @@ xfs_bmap_check_leaf_extents(
 	xfs_ifork_t		*ifp;	/* fork structure */
 	int			level;	/* btree level, for checking */
 	xfs_mount_t		*mp;	/* file system mount structure */
-	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+	__be64			*pp;	/* pointer to block address */
 	xfs_bmbt_rec_t		*ep;	/* pointer to current extent */
 	xfs_bmbt_rec_t		*lastp; /* pointer to previous extent */
 	xfs_bmbt_rec_t		*nextp;	/* pointer to next extent */
@@ -6231,10 +6231,12 @@ xfs_bmap_check_leaf_extents(
 	ASSERT(level > 0);
 	xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
 	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
-	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
-	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
-	bno = INT_GET(*pp, ARCH_CONVERT);
+	bno = be64_to_cpu(*pp);
+
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
 	/*
 	 * Go down the tree until leaf level is reached, following the first
 	 * pointer (leftmost) at each level.
@@ -6265,8 +6267,8 @@ xfs_bmap_check_leaf_extents(
 		xfs_check_block(block, mp, 0, 0);
 		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block,
 			1, mp->m_bmap_dmxr[1]);
-		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0);
-		bno = INT_GET(*pp, ARCH_CONVERT);
+		bno = be64_to_cpu(*pp);
+		XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
 		if (bp_release) {
 			bp_release = 0;
 			xfs_trans_brelse(NULL, bp);
@@ -6372,7 +6374,7 @@ xfs_bmap_count_blocks(
 	xfs_ifork_t		*ifp;	/* fork structure */
 	int			level;	/* btree level, for checking */
 	xfs_mount_t		*mp;	/* file system mount structure */
-	xfs_bmbt_ptr_t		*pp;	/* pointer to block address */
+	__be64			*pp;	/* pointer to block address */
 
 	bno = NULLFSBLOCK;
 	mp = ip->i_mount;
@@ -6395,10 +6397,10 @@ xfs_bmap_count_blocks(
 	level = be16_to_cpu(block->bb_level);
 	ASSERT(level > 0);
 	pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
-	ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO);
-	ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks);
-	bno = INT_GET(*pp, ARCH_CONVERT);
+	bno = be64_to_cpu(*pp);
+	ASSERT(bno != NULLDFSBNO);
+	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
+	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
 
 	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
 		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
@@ -6425,7 +6427,7 @@ xfs_bmap_count_tree(
 	int			error;
 	xfs_buf_t		*bp, *nbp;
 	int			level = levelin;
-	xfs_bmbt_ptr_t          *pp;
+	__be64			*pp;
 	xfs_fsblock_t           bno = blockno;
 	xfs_fsblock_t		nextbno;
 	xfs_bmbt_block_t        *block, *nextblock;
@@ -6452,7 +6454,7 @@ xfs_bmap_count_tree(
 		/* Dive to the next level */
 		pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize,
 			xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
-		bno = INT_GET(*pp, ARCH_CONVERT);
+		bno = be64_to_cpu(*pp);
 		if (unlikely((error =
 		     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
 			xfs_trans_brelse(tp, bp);
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 18fb7385d719..a7b835bf870a 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -58,7 +58,7 @@ STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
 STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
 STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
 STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
-		xfs_bmbt_key_t *, xfs_btree_cur_t **, int *);
+		__uint64_t *, xfs_btree_cur_t **, int *);
 STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
 
 
@@ -192,16 +192,11 @@ xfs_bmbt_trace_argifk(
 	xfs_btree_cur_t		*cur,
 	int			i,
 	xfs_fsblock_t		f,
-	xfs_bmbt_key_t		*k,
+	xfs_dfiloff_t		o,
 	int			line)
 {
-	xfs_dfsbno_t		d;
-	xfs_dfiloff_t		o;
-
-	d = (xfs_dfsbno_t)f;
-	o = INT_GET(k->br_startoff, ARCH_CONVERT);
 	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
-		i, d >> 32, (int)d, o >> 32,
+		i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32,
 		(int)o, 0, 0, 0,
 		0, 0, 0);
 }
@@ -248,7 +243,7 @@ xfs_bmbt_trace_argik(
 {
 	xfs_dfiloff_t		o;
 
-	o = INT_GET(k->br_startoff, ARCH_CONVERT);
+	o = be64_to_cpu(k->br_startoff);
 	xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
 		i, o >> 32, (int)o, 0,
 		0, 0, 0, 0,
@@ -286,8 +281,8 @@ xfs_bmbt_trace_cursor(
 	xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__)
 #define	XFS_BMBT_TRACE_ARGI(c,i)	\
 	xfs_bmbt_trace_argi(fname, c, i, __LINE__)
-#define	XFS_BMBT_TRACE_ARGIFK(c,i,f,k)	\
-	xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__)
+#define	XFS_BMBT_TRACE_ARGIFK(c,i,f,s)	\
+	xfs_bmbt_trace_argifk(fname, c, i, f, s, __LINE__)
 #define	XFS_BMBT_TRACE_ARGIFR(c,i,f,r)	\
 	xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__)
 #define	XFS_BMBT_TRACE_ARGIK(c,i,k)	\
@@ -299,7 +294,7 @@ xfs_bmbt_trace_cursor(
 #define	XFS_BMBT_TRACE_ARGBII(c,b,i,j)
 #define	XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
 #define	XFS_BMBT_TRACE_ARGI(c,i)
-#define	XFS_BMBT_TRACE_ARGIFK(c,i,f,k)
+#define	XFS_BMBT_TRACE_ARGIFK(c,i,f,s)
 #define	XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
 #define	XFS_BMBT_TRACE_ARGIK(c,i,k)
 #define	XFS_BMBT_TRACE_CURSOR(c,s)
@@ -357,7 +352,7 @@ xfs_bmbt_delrec(
 	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
 	XFS_BMBT_TRACE_ARGI(cur, level);
 	ptr = cur->bc_ptrs[level];
-	tcur = (xfs_btree_cur_t *)0;
+	tcur = NULL;
 	if (ptr == 0) {
 		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
 		*stat = 0;
@@ -382,7 +377,7 @@ xfs_bmbt_delrec(
 		pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
 #ifdef DEBUG
 		for (i = ptr; i < numrecs; i++) {
-			if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
+			if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
 				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 				goto error0;
 			}
@@ -404,7 +399,8 @@ xfs_bmbt_delrec(
 			xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
 		}
 		if (ptr == 1) {
-			INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(rp));
+			key.br_startoff =
+				cpu_to_be64(xfs_bmbt_disk_get_startoff(rp));
 			kp = &key;
 		}
 	}
@@ -621,7 +617,7 @@ xfs_bmbt_delrec(
 		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
 #ifdef DEBUG
 		for (i = 0; i < numrrecs; i++) {
-			if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
+			if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
 				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 				goto error0;
 			}
@@ -748,7 +744,7 @@ xfs_bmbt_insrec(
 	int			logflags;	/* inode logging flags */
 	xfs_fsblock_t		nbno;		/* new block number */
 	struct xfs_btree_cur	*ncur;		/* new btree cursor */
-	xfs_bmbt_key_t		nkey;		/* new btree key value */
+	__uint64_t		startoff;	/* new btree key value */
 	xfs_bmbt_rec_t		nrec;		/* new record count */
 	int			optr;		/* old key/record index */
 	xfs_bmbt_ptr_t		*pp;		/* pointer to bmap block addr */
@@ -759,9 +755,8 @@ xfs_bmbt_insrec(
 	ASSERT(level < cur->bc_nlevels);
 	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
 	XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
-	ncur = (xfs_btree_cur_t *)0;
-	INT_SET(key.br_startoff, ARCH_CONVERT,
-		xfs_bmbt_disk_get_startoff(recp));
+	ncur = NULL;
+	key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp));
 	optr = ptr = cur->bc_ptrs[level];
 	if (ptr == 0) {
 		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
@@ -820,7 +815,7 @@ xfs_bmbt_insrec(
 					optr = ptr = cur->bc_ptrs[level];
 				} else {
 					if ((error = xfs_bmbt_split(cur, level,
-							&nbno, &nkey, &ncur,
+							&nbno, &startoff, &ncur,
 							&i))) {
 						XFS_BMBT_TRACE_CURSOR(cur,
 							ERROR);
@@ -840,7 +835,7 @@ xfs_bmbt_insrec(
 #endif
 						ptr = cur->bc_ptrs[level];
 						xfs_bmbt_disk_set_allf(&nrec,
-							nkey.br_startoff, 0, 0,
+							startoff, 0, 0,
 							XFS_EXT_NORM);
 					} else {
 						XFS_BMBT_TRACE_CURSOR(cur,
@@ -858,7 +853,7 @@ xfs_bmbt_insrec(
 		pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
 #ifdef DEBUG
 		for (i = numrecs; i >= ptr; i--) {
-			if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT),
+			if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1],
 					level))) {
 				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 				return error;
@@ -870,14 +865,13 @@ xfs_bmbt_insrec(
 		memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */
 			(numrecs - ptr + 1) * sizeof(*pp));
 #ifdef DEBUG
-		if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop,
-				level))) {
+		if ((error = xfs_btree_check_lptr(cur, *bnop, level))) {
 			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 			return error;
 		}
 #endif
 		kp[ptr - 1] = key;
-		INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop);
+		pp[ptr - 1] = cpu_to_be64(*bnop);
 		numrecs++;
 		block->bb_numrecs = cpu_to_be16(numrecs);
 		xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
@@ -988,7 +982,7 @@ xfs_bmbt_killroot(
 	cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
 #ifdef DEBUG
 	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-		if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) {
+		if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) {
 			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 			return error;
 		}
@@ -1132,7 +1126,7 @@ xfs_bmbt_lookup(
 			d = XFS_FSB_TO_DADDR(mp, fsbno);
 			bp = cur->bc_bufs[level];
 			if (bp && XFS_BUF_ADDR(bp) != d)
-				bp = (xfs_buf_t *)0;
+				bp = NULL;
 			if (!bp) {
 				if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
 						0, &bp, XFS_BMAP_BTREE_REF))) {
@@ -1170,7 +1164,7 @@ xfs_bmbt_lookup(
 				keyno = (low + high) >> 1;
 				if (level > 0) {
 					kkp = kkbase + keyno - 1;
-					startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT);
+					startoff = be64_to_cpu(kkp->br_startoff);
 				} else {
 					krp = krbase + keyno - 1;
 					startoff = xfs_bmbt_disk_get_startoff(krp);
@@ -1189,13 +1183,13 @@ xfs_bmbt_lookup(
 			if (diff > 0 && --keyno < 1)
 				keyno = 1;
 			pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
+			fsbno = be64_to_cpu(*pp);
 #ifdef DEBUG
-			if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
+			if ((error = xfs_btree_check_lptr(cur, fsbno, level))) {
 				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 				return error;
 			}
 #endif
-			fsbno = INT_GET(*pp, ARCH_CONVERT);
 			cur->bc_ptrs[level] = keyno;
 		}
 	}
@@ -1313,7 +1307,7 @@ xfs_bmbt_lshift(
 		lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
 		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
 #ifdef DEBUG
-		if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) {
+		if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) {
 			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 			return error;
 		}
@@ -1340,7 +1334,7 @@ xfs_bmbt_lshift(
 	if (level > 0) {
 #ifdef DEBUG
 		for (i = 0; i < rrecs; i++) {
-			if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT),
+			if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1],
 					level))) {
 				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 				return error;
@@ -1354,8 +1348,7 @@ xfs_bmbt_lshift(
 	} else {
 		memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
 		xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
-		INT_SET(key.br_startoff, ARCH_CONVERT,
-			xfs_bmbt_disk_get_startoff(rrp));
+		key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
 		rkp = &key;
 	}
 	if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
@@ -1445,7 +1438,7 @@ xfs_bmbt_rshift(
 		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
 #ifdef DEBUG
 		for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
-			if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) {
+			if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
 				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 				return error;
 			}
@@ -1454,7 +1447,7 @@ xfs_bmbt_rshift(
 		memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
 		memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
 #ifdef DEBUG
-		if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) {
+		if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) {
 			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 			return error;
 		}
@@ -1469,8 +1462,7 @@ xfs_bmbt_rshift(
 		memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
 		*rrp = *lrp;
 		xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-		INT_SET(key.br_startoff, ARCH_CONVERT,
-			xfs_bmbt_disk_get_startoff(rrp));
+		key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
 		rkp = &key;
 	}
 	be16_add(&left->bb_numrecs, -1);
@@ -1535,7 +1527,7 @@ xfs_bmbt_split(
 	xfs_btree_cur_t		*cur,
 	int			level,
 	xfs_fsblock_t		*bnop,
-	xfs_bmbt_key_t		*keyp,
+	__uint64_t		*startoff,
 	xfs_btree_cur_t		**curp,
 	int			*stat)		/* success/failure */
 {
@@ -1560,7 +1552,7 @@ xfs_bmbt_split(
 	xfs_bmbt_rec_t		*rrp;		/* right record pointer */
 
 	XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-	XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp);
+	XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff);
 	args.tp = cur->bc_tp;
 	args.mp = cur->bc_mp;
 	lbp = cur->bc_bufs[level];
@@ -1619,7 +1611,7 @@ xfs_bmbt_split(
 		rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
 #ifdef DEBUG
 		for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-			if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) {
+			if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) {
 				XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 				return error;
 			}
@@ -1629,13 +1621,13 @@ xfs_bmbt_split(
 		memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
 		xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
 		xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-		keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT);
+		*startoff = be64_to_cpu(rkp->br_startoff);
 	} else {
 		lrp = XFS_BMAP_REC_IADDR(left, i, cur);
 		rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
 		memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
 		xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-		keyp->br_startoff = xfs_bmbt_disk_get_startoff(rrp);
+		*startoff = xfs_bmbt_disk_get_startoff(rrp);
 	}
 	be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
 	right->bb_rightsib = left->bb_rightsib;
@@ -1728,9 +1720,9 @@ xfs_bmdr_to_bmbt(
 {
 	int			dmxr;
 	xfs_bmbt_key_t		*fkp;
-	xfs_bmbt_ptr_t		*fpp;
+	__be64			*fpp;
 	xfs_bmbt_key_t		*tkp;
-	xfs_bmbt_ptr_t		*tpp;
+	__be64			*tpp;
 
 	rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
 	rblock->bb_level = dblock->bb_level;
@@ -1745,7 +1737,7 @@ xfs_bmdr_to_bmbt(
 	tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
 	dmxr = be16_to_cpu(dblock->bb_numrecs);
 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
-	memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
+	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
 }
 
 /*
@@ -1805,7 +1797,7 @@ xfs_bmbt_decrement(
 	tp = cur->bc_tp;
 	mp = cur->bc_mp;
 	for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
-		fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
 		if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
 				XFS_BMAP_BTREE_REF))) {
 			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
@@ -2135,7 +2127,7 @@ xfs_bmbt_increment(
 	tp = cur->bc_tp;
 	mp = cur->bc_mp;
 	for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
-		fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT);
+		fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
 		if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
 				XFS_BMAP_BTREE_REF))) {
 			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
@@ -2178,7 +2170,7 @@ xfs_bmbt_insert(
 	level = 0;
 	nbno = NULLFSBLOCK;
 	xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
-	ncur = (xfs_btree_cur_t *)0;
+	ncur = NULL;
 	pcur = cur;
 	do {
 		if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
@@ -2205,7 +2197,7 @@ xfs_bmbt_insert(
 		}
 		if (ncur) {
 			pcur = ncur;
-			ncur = (xfs_btree_cur_t *)0;
+			ncur = NULL;
 		}
 	} while (nbno != NULLFSBLOCK);
 	XFS_BMBT_TRACE_CURSOR(cur, EXIT);
@@ -2356,12 +2348,12 @@ xfs_bmbt_newroot(
 	args.firstblock = args.fsbno;
 	if (args.fsbno == NULLFSBLOCK) {
 #ifdef DEBUG
-		if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
+		if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) {
 			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 			return error;
 		}
 #endif
-		args.fsbno = INT_GET(*pp, ARCH_CONVERT);
+		args.fsbno = be64_to_cpu(*pp);
 		args.type = XFS_ALLOCTYPE_START_BNO;
 	} else
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -2393,7 +2385,7 @@ xfs_bmbt_newroot(
 	cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
 #ifdef DEBUG
 	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-		if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) {
+		if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
 			XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 			return error;
 		}
@@ -2401,13 +2393,12 @@ xfs_bmbt_newroot(
 #endif
 	memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
 #ifdef DEBUG
-	if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno,
-			level))) {
+	if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) {
 		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 		return error;
 	}
 #endif
-	INT_SET(*pp, ARCH_CONVERT, args.fsbno);
+	*pp = cpu_to_be64(args.fsbno);
 	xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
 		cur->bc_private.b.whichfork);
 	xfs_btree_setbuf(cur, level, bp);
@@ -2681,9 +2672,9 @@ xfs_bmbt_to_bmdr(
 {
 	int			dmxr;
 	xfs_bmbt_key_t		*fkp;
-	xfs_bmbt_ptr_t		*fpp;
+	__be64			*fpp;
 	xfs_bmbt_key_t		*tkp;
-	xfs_bmbt_ptr_t		*tpp;
+	__be64			*tpp;
 
 	ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
 	ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO);
@@ -2698,7 +2689,7 @@ xfs_bmbt_to_bmdr(
 	tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr);
 	dmxr = be16_to_cpu(dblock->bb_numrecs);
 	memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
-	memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */
+	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
 }
 
 /*
@@ -2740,7 +2731,7 @@ xfs_bmbt_update(
 		XFS_BMBT_TRACE_CURSOR(cur, EXIT);
 		return 0;
 	}
-	INT_SET(key.br_startoff, ARCH_CONVERT, off);
+	key.br_startoff = cpu_to_be64(off);
 	if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
 		XFS_BMBT_TRACE_CURSOR(cur, ERROR);
 		return error;
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 6478cfa0e539..49539de9525b 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -163,13 +163,14 @@ typedef struct xfs_bmbt_irec
 /*
  * Key structure for non-leaf levels of the tree.
  */
-typedef struct xfs_bmbt_key
-{
-	xfs_dfiloff_t	br_startoff;	/* starting file offset */
+typedef struct xfs_bmbt_key {
+	__be64		br_startoff;	/* starting file offset */
 } xfs_bmbt_key_t, xfs_bmdr_key_t;
 
-typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;	/* btree pointer type */
-					/* btree block header type */
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+/* btree block header type */
 typedef struct xfs_btree_lblock xfs_bmbt_block_t;
 
 #define XFS_BUF_TO_BMBT_BLOCK(bp)	((xfs_bmbt_block_t *)XFS_BUF_PTR(bp))
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index ee2255bd6562..aeb87ca69fcc 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -161,7 +161,7 @@ xfs_btree_check_key(
 
 		k1 = ak1;
 		k2 = ak2;
-		ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT));
+		ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff));
 		break;
 	    }
 	case XFS_BTNUM_INO: {
@@ -170,7 +170,7 @@ xfs_btree_check_key(
 
 		k1 = ak1;
 		k2 = ak2;
-		ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT));
+		ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino));
 		break;
 	    }
 	default:
@@ -285,8 +285,8 @@ xfs_btree_check_rec(
 
 		r1 = ar1;
 		r2 = ar2;
-		ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <=
-		       INT_GET(r2->ir_startino, ARCH_CONVERT));
+		ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <=
+		       be32_to_cpu(r2->ir_startino));
 		break;
 	    }
 	default:
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 44f1bd98064a..892b06c54263 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -145,7 +145,7 @@ typedef struct xfs_btree_cur
 	union {
 		xfs_alloc_rec_incore_t	a;
 		xfs_bmbt_irec_t		b;
-		xfs_inobt_rec_t		i;
+		xfs_inobt_rec_incore_t	i;
 	}		bc_rec;		/* current insert/search record value */
 	struct xfs_buf	*bc_bufs[XFS_BTREE_MAXLEVELS];	/* buf ptr per level */
 	int		bc_ptrs[XFS_BTREE_MAXLEVELS];	/* key/record # */
@@ -243,6 +243,9 @@ xfs_btree_check_lptr(
 	xfs_dfsbno_t		ptr,	/* btree block disk address */
 	int			level);	/* btree block level */
 
+#define xfs_btree_check_lptr_disk(cur, ptr, level) \
+	xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level)
+
 /*
  * Checking routine: check that short form block header is ok.
  */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a4aa53974f76..7a55c248ea70 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -234,7 +234,6 @@ xfs_buf_item_format(
 	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 	       (bip->bli_flags & XFS_BLI_STALE));
 	bp = bip->bli_buf;
-	ASSERT(XFS_BUF_BP_ISMAPPED(bp));
 	vecp = log_vector;
 
 	/*
@@ -628,25 +627,6 @@ xfs_buf_item_committed(
 }
 
 /*
- * This is called when the transaction holding the buffer is aborted.
- * Just behave as if the transaction had been cancelled. If we're shutting down
- * and have aborted this transaction, we'll trap this buffer when it tries to
- * get written out.
- */
-STATIC void
-xfs_buf_item_abort(
-	xfs_buf_log_item_t	*bip)
-{
-	xfs_buf_t	*bp;
-
-	bp = bip->bli_buf;
-	xfs_buftrace("XFS_ABORT", bp);
-	XFS_BUF_SUPER_STALE(bp);
-	xfs_buf_item_unlock(bip);
-	return;
-}
-
-/*
  * This is called to asynchronously write the buffer associated with this
  * buf log item out to disk. The buffer will already have been locked by
  * a successful call to xfs_buf_item_trylock().  If the buffer still has
@@ -693,7 +673,6 @@ STATIC struct xfs_item_ops xfs_buf_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_buf_item_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_buf_item_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_buf_item_abort,
 	.iop_pushbuf	= NULL,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_buf_item_committing
@@ -901,7 +880,6 @@ xfs_buf_item_relse(
 	XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
 	if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
 	    (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
-		ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0);
 		XFS_BUF_CLR_IODONE_FUNC(bp);
 	}
 
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 32ab61d17ace..a68bc1f1a313 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1054,7 +1054,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 	xfs_da_node_entry_t *btree;
 	xfs_dablk_t blkno;
 	int probe, span, max, error, retval;
-	xfs_dahash_t hashval;
+	xfs_dahash_t hashval, btreehashval;
 	xfs_da_args_t *args;
 
 	args = state->args;
@@ -1079,30 +1079,32 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 			return(error);
 		}
 		curr = blk->bp->data;
-		ASSERT(be16_to_cpu(curr->magic) == XFS_DA_NODE_MAGIC ||
-		       be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC ||
-		       be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC);
+		blk->magic = be16_to_cpu(curr->magic);
+		ASSERT(blk->magic == XFS_DA_NODE_MAGIC ||
+		       blk->magic == XFS_DIR2_LEAFN_MAGIC ||
+		       blk->magic == XFS_ATTR_LEAF_MAGIC);
 
 		/*
 		 * Search an intermediate node for a match.
 		 */
-		blk->magic = be16_to_cpu(curr->magic);
 		if (blk->magic == XFS_DA_NODE_MAGIC) {
 			node = blk->bp->data;
-			blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval);
+			max = be16_to_cpu(node->hdr.count);
+			btreehashval = node->btree[max-1].hashval;
+			blk->hashval = be32_to_cpu(btreehashval);
 
 			/*
 			 * Binary search.  (note: small blocks will skip loop)
 			 */
-			max = be16_to_cpu(node->hdr.count);
 			probe = span = max / 2;
 			hashval = args->hashval;
 			for (btree = &node->btree[probe]; span > 4;
 				   btree = &node->btree[probe]) {
 				span /= 2;
-				if (be32_to_cpu(btree->hashval) < hashval)
+				btreehashval = be32_to_cpu(btree->hashval);
+				if (btreehashval < hashval)
 					probe += span;
-				else if (be32_to_cpu(btree->hashval) > hashval)
+				else if (btreehashval > hashval)
 					probe -= span;
 				else
 					break;
@@ -1133,10 +1135,10 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 				blk->index = probe;
 				blkno = be32_to_cpu(btree->before);
 			}
-		} else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) {
+		} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
 			blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
 			break;
-		} else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) {
+		} else if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
 			blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
 			break;
 		}
@@ -1152,11 +1154,13 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 		if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
 			retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
 							&blk->index, state);
-		}
-		else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+		} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
 			retval = xfs_attr_leaf_lookup_int(blk->bp, args);
 			blk->index = args->index;
 			args->blkno = blk->blkno;
+		} else {
+			ASSERT(0);
+			return XFS_ERROR(EFSCORRUPTED);
 		}
 		if (((retval == ENOENT) || (retval == ENOATTR)) &&
 		    (blk->hashval == args->hashval)) {
@@ -1166,8 +1170,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 				return(error);
 			if (retval == 0) {
 				continue;
-			}
-			else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
+			} else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
 				/* path_shift() gives ENOENT */
 				retval = XFS_ERROR(ENOATTR);
 			}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index bc43163456ef..0893e16b7d83 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -18,14 +18,6 @@
 #ifndef	__XFS_ERROR_H__
 #define	__XFS_ERROR_H__
 
-#define XFS_ERECOVER	1	/* Failure to recover log */
-#define XFS_ELOGSTAT	2	/* Failure to stat log in user space */
-#define XFS_ENOLOGSPACE	3	/* Reservation too large */
-#define XFS_ENOTSUP	4	/* Operation not supported */
-#define	XFS_ENOLSN	5	/* Can't find the lsn you asked for */
-#define XFS_ENOTFOUND	6
-#define XFS_ENOTXFS	7	/* Not XFS filesystem */
-
 #ifdef DEBUG
 #define	XFS_ERROR_NTRAP	10
 extern int	xfs_etrap[XFS_ERROR_NTRAP];
@@ -175,6 +167,7 @@ extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud);
 #define		XFS_PTAG_SHUTDOWN_CORRUPT	0x00000010
 #define		XFS_PTAG_SHUTDOWN_IOERROR	0x00000020
 #define		XFS_PTAG_SHUTDOWN_LOGERROR	0x00000040
+#define		XFS_PTAG_FSBLOCK_ZERO		0x00000080
 
 struct xfs_mount;
 /* PRINTFLIKE4 */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 6cf6d8769b97..6dba78199faf 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -33,9 +33,6 @@ kmem_zone_t	*xfs_efi_zone;
 kmem_zone_t	*xfs_efd_zone;
 
 STATIC void	xfs_efi_item_unlock(xfs_efi_log_item_t *);
-STATIC void	xfs_efi_item_abort(xfs_efi_log_item_t *);
-STATIC void	xfs_efd_item_abort(xfs_efd_log_item_t *);
-
 
 void
 xfs_efi_item_free(xfs_efi_log_item_t *efip)
@@ -184,7 +181,7 @@ STATIC void
 xfs_efi_item_unlock(xfs_efi_log_item_t *efip)
 {
 	if (efip->efi_item.li_flags & XFS_LI_ABORTED)
-		xfs_efi_item_abort(efip);
+		xfs_efi_item_free(efip);
 	return;
 }
 
@@ -202,18 +199,6 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn)
 }
 
 /*
- * This is called when the transaction logging the EFI is aborted.
- * Free up the EFI and return.  No need to clean up the slot for
- * the item in the transaction.  That was done by the unpin code
- * which is called prior to this routine in the abort/fs-shutdown path.
- */
-STATIC void
-xfs_efi_item_abort(xfs_efi_log_item_t *efip)
-{
-	xfs_efi_item_free(efip);
-}
-
-/*
  * There isn't much you can do to push on an efi item.  It is simply
  * stuck waiting for all of its corresponding efd items to be
  * committed to disk.
@@ -255,7 +240,6 @@ STATIC struct xfs_item_ops xfs_efi_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_efi_item_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efi_item_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_efi_item_abort,
 	.iop_pushbuf	= NULL,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_efi_item_committing
@@ -386,33 +370,6 @@ xfs_efi_release(xfs_efi_log_item_t	*efip,
 	}
 }
 
-/*
- * This is called when the transaction that should be committing the
- * EFD corresponding to the given EFI is aborted.  The committed and
- * canceled flags are used to coordinate the freeing of the EFI and
- * the references by the transaction that committed it.
- */
-STATIC void
-xfs_efi_cancel(
-	xfs_efi_log_item_t	*efip)
-{
-	xfs_mount_t	*mp;
-	SPLDECL(s);
-
-	mp = efip->efi_item.li_mountp;
-	AIL_LOCK(mp, s);
-	if (efip->efi_flags & XFS_EFI_COMMITTED) {
-		/*
-		 * xfs_trans_delete_ail() drops the AIL lock.
-		 */
-		xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s);
-		xfs_efi_item_free(efip);
-	} else {
-		efip->efi_flags |= XFS_EFI_CANCELED;
-		AIL_UNLOCK(mp, s);
-	}
-}
-
 STATIC void
 xfs_efd_item_free(xfs_efd_log_item_t *efdp)
 {
@@ -514,7 +471,7 @@ STATIC void
 xfs_efd_item_unlock(xfs_efd_log_item_t *efdp)
 {
 	if (efdp->efd_item.li_flags & XFS_LI_ABORTED)
-		xfs_efd_item_abort(efdp);
+		xfs_efd_item_free(efdp);
 	return;
 }
 
@@ -541,27 +498,6 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn)
 }
 
 /*
- * The transaction of which this EFD is a part has been aborted.
- * Inform its companion EFI of this fact and then clean up after
- * ourselves.  No need to clean up the slot for the item in the
- * transaction.  That was done by the unpin code which is called
- * prior to this routine in the abort/fs-shutdown path.
- */
-STATIC void
-xfs_efd_item_abort(xfs_efd_log_item_t *efdp)
-{
-	/*
-	 * If we got a log I/O error, it's always the case that the LR with the
-	 * EFI got unpinned and freed before the EFD got aborted. So don't
-	 * reference the EFI at all in that case.
-	 */
-	if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0)
-		xfs_efi_cancel(efdp->efd_efip);
-
-	xfs_efd_item_free(efdp);
-}
-
-/*
  * There isn't much you can do to push on an efd item.  It is simply
  * stuck waiting for the log to be flushed to disk.
  */
@@ -602,7 +538,6 @@ STATIC struct xfs_item_ops xfs_efd_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_efd_item_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_efd_item_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_efd_item_abort,
 	.iop_pushbuf	= NULL,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_efd_item_committing
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 0ea45edaab03..2f049f63e85f 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -33,14 +33,16 @@ typedef struct xfs_extent {
  * conversion routine.
  */
 
+#ifndef HAVE_FORMAT32
 typedef struct xfs_extent_32 {
-	xfs_dfsbno_t	ext_start;
-	xfs_extlen_t	ext_len;
+	__uint64_t	ext_start;
+	__uint32_t	ext_len;
 } __attribute__((packed)) xfs_extent_32_t;
+#endif
 
 typedef struct xfs_extent_64 {
-	xfs_dfsbno_t	ext_start;
-	xfs_extlen_t	ext_len;
+	__uint64_t	ext_start;
+	__uint32_t	ext_len;
 	__uint32_t	ext_pad;
 } xfs_extent_64_t;
 
@@ -50,25 +52,27 @@ typedef struct xfs_extent_64 {
  * size is given by efi_nextents.
  */
 typedef struct xfs_efi_log_format {
-	unsigned short		efi_type;	/* efi log item type */
-	unsigned short		efi_size;	/* size of this item */
-	uint			efi_nextents;	/* # extents to free */
+	__uint16_t		efi_type;	/* efi log item type */
+	__uint16_t		efi_size;	/* size of this item */
+	__uint32_t		efi_nextents;	/* # extents to free */
 	__uint64_t		efi_id;		/* efi identifier */
 	xfs_extent_t		efi_extents[1];	/* array of extents to free */
 } xfs_efi_log_format_t;
 
+#ifndef HAVE_FORMAT32
 typedef struct xfs_efi_log_format_32 {
-	unsigned short		efi_type;	/* efi log item type */
-	unsigned short		efi_size;	/* size of this item */
-	uint			efi_nextents;	/* # extents to free */
+	__uint16_t		efi_type;	/* efi log item type */
+	__uint16_t		efi_size;	/* size of this item */
+	__uint32_t		efi_nextents;	/* # extents to free */
 	__uint64_t		efi_id;		/* efi identifier */
 	xfs_extent_32_t		efi_extents[1];	/* array of extents to free */
 } __attribute__((packed)) xfs_efi_log_format_32_t;
+#endif
 
 typedef struct xfs_efi_log_format_64 {
-	unsigned short		efi_type;	/* efi log item type */
-	unsigned short		efi_size;	/* size of this item */
-	uint			efi_nextents;	/* # extents to free */
+	__uint16_t		efi_type;	/* efi log item type */
+	__uint16_t		efi_size;	/* size of this item */
+	__uint32_t		efi_nextents;	/* # extents to free */
 	__uint64_t		efi_id;		/* efi identifier */
 	xfs_extent_64_t		efi_extents[1];	/* array of extents to free */
 } xfs_efi_log_format_64_t;
@@ -79,25 +83,27 @@ typedef struct xfs_efi_log_format_64 {
  * size is given by efd_nextents;
  */
 typedef struct xfs_efd_log_format {
-	unsigned short		efd_type;	/* efd log item type */
-	unsigned short		efd_size;	/* size of this item */
-	uint			efd_nextents;	/* # of extents freed */
+	__uint16_t		efd_type;	/* efd log item type */
+	__uint16_t		efd_size;	/* size of this item */
+	__uint32_t		efd_nextents;	/* # of extents freed */
 	__uint64_t		efd_efi_id;	/* id of corresponding efi */
 	xfs_extent_t		efd_extents[1];	/* array of extents freed */
 } xfs_efd_log_format_t;
 
+#ifndef HAVE_FORMAT32
 typedef struct xfs_efd_log_format_32 {
-	unsigned short		efd_type;	/* efd log item type */
-	unsigned short		efd_size;	/* size of this item */
-	uint			efd_nextents;	/* # of extents freed */
+	__uint16_t		efd_type;	/* efd log item type */
+	__uint16_t		efd_size;	/* size of this item */
+	__uint32_t		efd_nextents;	/* # of extents freed */
 	__uint64_t		efd_efi_id;	/* id of corresponding efi */
 	xfs_extent_32_t		efd_extents[1];	/* array of extents freed */
 } __attribute__((packed)) xfs_efd_log_format_32_t;
+#endif
 
 typedef struct xfs_efd_log_format_64 {
-	unsigned short		efd_type;	/* efd log item type */
-	unsigned short		efd_size;	/* size of this item */
-	uint			efd_nextents;	/* # of extents freed */
+	__uint16_t		efd_type;	/* efd log item type */
+	__uint16_t		efd_size;	/* size of this item */
+	__uint32_t		efd_nextents;	/* # of extents freed */
 	__uint64_t		efd_efi_id;	/* id of corresponding efi */
 	xfs_extent_64_t		efd_extents[1];	/* array of extents freed */
 } xfs_efd_log_format_64_t;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 0f0ad1535951..1335449841cd 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -22,8 +22,6 @@
  * SGI's XFS filesystem's major stuff (constants, structures)
  */
 
-#define XFS_NAME	"xfs"
-
 /*
  * Direct I/O attribute record used with XFS_IOC_DIOINFO
  * d_miniosz is the min xfer size, xfer size multiple and file seek offset
@@ -426,11 +424,7 @@ typedef struct xfs_handle {
 				 - (char *) &(handle))			  \
 				 + (handle).ha_fid.xfs_fid_len)
 
-#define XFS_HANDLE_CMP(h1, h2)	memcmp(h1, h2, sizeof(xfs_handle_t))
-
-#define FSHSIZE		sizeof(fsid_t)
-
-/* 
+/*
  * Flags for going down operation
  */
 #define XFS_FSOP_GOING_FLAGS_DEFAULT		0x0	/* going down */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 33164a85aa9d..a446e5a115c6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -458,7 +458,7 @@ nextag:
 		 */
 		if (XFS_FORCED_SHUTDOWN(mp)) {
 			up_read(&mp->m_peraglock);
-			return (xfs_buf_t *)0;
+			return NULL;
 		}
 		agno++;
 		if (agno >= agcount)
@@ -466,7 +466,7 @@ nextag:
 		if (agno == pagno) {
 			if (flags == 0) {
 				up_read(&mp->m_peraglock);
-				return (xfs_buf_t *)0;
+				return NULL;
 			}
 			flags = 0;
 		}
@@ -529,10 +529,10 @@ xfs_dialloc(
 	int		offset;		/* index of inode in chunk */
 	xfs_agino_t	pagino;		/* parent's a.g. relative inode # */
 	xfs_agnumber_t	pagno;		/* parent's allocation group number */
-	xfs_inobt_rec_t	rec;		/* inode allocation record */
+	xfs_inobt_rec_incore_t rec;	/* inode allocation record */
 	xfs_agnumber_t	tagno;		/* testing allocation group number */
 	xfs_btree_cur_t	*tcur;		/* temp cursor */
-	xfs_inobt_rec_t	trec;		/* temp inode allocation record */
+	xfs_inobt_rec_incore_t trec;	/* temp inode allocation record */
 
 
 	if (*IO_agbp == NULL) {
@@ -945,7 +945,7 @@ xfs_difree(
 	int		ilen;	/* inodes in an inode cluster */
 	xfs_mount_t	*mp;	/* mount structure for filesystem */
 	int		off;	/* offset of inode in inode chunk */
-	xfs_inobt_rec_t	rec;	/* btree record */
+	xfs_inobt_rec_incore_t rec;	/* btree record */
 
 	mp = tp->t_mountp;
 
@@ -1195,6 +1195,7 @@ xfs_dilocate(
 					"(0x%llx)",
 					ino, XFS_AGINO_TO_INO(mp, agno, agino));
 		}
+		xfs_stack_trace();
 #endif /* DEBUG */
 		return XFS_ERROR(EINVAL);
 	}
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 616eeeb6953e..8cdeeaf8632b 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -568,7 +568,7 @@ xfs_inobt_insrec(
 	/*
 	 * Make a key out of the record data to be inserted, and save it.
 	 */
-	key.ir_startino = recp->ir_startino; /* INT_: direct copy */
+	key.ir_startino = recp->ir_startino;
 	optr = ptr = cur->bc_ptrs[level];
 	/*
 	 * If we're off the left edge, return failure.
@@ -600,7 +600,7 @@ xfs_inobt_insrec(
 	}
 #endif
 	nbno = NULLAGBLOCK;
-	ncur = (xfs_btree_cur_t *)0;
+	ncur = NULL;
 	/*
 	 * If the block is full, we can't insert the new entry until we
 	 * make the block un-full.
@@ -641,7 +641,7 @@ xfs_inobt_insrec(
 						return error;
 #endif
 					ptr = cur->bc_ptrs[level];
-					nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */
+					nrec.ir_startino = nkey.ir_startino;
 				} else {
 					/*
 					 * Otherwise the insert fails.
@@ -681,7 +681,7 @@ xfs_inobt_insrec(
 		if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
 			return error;
 #endif
-		kp[ptr - 1] = key; /* INT_: struct copy */
+		kp[ptr - 1] = key;
 		pp[ptr - 1] = cpu_to_be32(*bnop);
 		numrecs++;
 		block->bb_numrecs = cpu_to_be16(numrecs);
@@ -698,7 +698,7 @@ xfs_inobt_insrec(
 		 * Now stuff the new record in, bump numrecs
 		 * and log the new data.
 		 */
-		rp[ptr - 1] = *recp; /* INT_: struct copy */
+		rp[ptr - 1] = *recp;
 		numrecs++;
 		block->bb_numrecs = cpu_to_be16(numrecs);
 		xfs_inobt_log_recs(cur, bp, ptr, numrecs);
@@ -731,7 +731,7 @@ xfs_inobt_insrec(
 	 */
 	*bnop = nbno;
 	if (nbno != NULLAGBLOCK) {
-		*recp = nrec; /* INT_: struct copy */
+		*recp = nrec;
 		*curp = ncur;
 	}
 	*stat = 1;
@@ -878,7 +878,7 @@ xfs_inobt_lookup(
 		 */
 		bp = cur->bc_bufs[level];
 		if (bp && XFS_BUF_ADDR(bp) != d)
-			bp = (xfs_buf_t *)0;
+			bp = NULL;
 		if (!bp) {
 			/*
 			 * Need to get a new buffer.  Read it, then
@@ -950,12 +950,12 @@ xfs_inobt_lookup(
 					xfs_inobt_key_t	*kkp;
 
 					kkp = kkbase + keyno - 1;
-					startino = INT_GET(kkp->ir_startino, ARCH_CONVERT);
+					startino = be32_to_cpu(kkp->ir_startino);
 				} else {
 					xfs_inobt_rec_t	*krp;
 
 					krp = krbase + keyno - 1;
-					startino = INT_GET(krp->ir_startino, ARCH_CONVERT);
+					startino = be32_to_cpu(krp->ir_startino);
 				}
 				/*
 				 * Compute difference to get next direction.
@@ -1117,7 +1117,7 @@ xfs_inobt_lshift(
 		if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
 			return error;
 #endif
-		*lpp = *rpp; /* INT_: no-change copy */
+		*lpp = *rpp;
 		xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
 	}
 	/*
@@ -1160,7 +1160,7 @@ xfs_inobt_lshift(
 	} else {
 		memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
 		xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-		key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
+		key.ir_startino = rrp->ir_startino;
 		rkp = &key;
 	}
 	/*
@@ -1297,13 +1297,13 @@ xfs_inobt_newroot(
 	 */
 	kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
 	if (be16_to_cpu(left->bb_level) > 0) {
-		kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */
-		kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */
+		kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur);
+		kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur);
 	} else {
 		rp = XFS_INOBT_REC_ADDR(left, 1, cur);
-		INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT);
+		kp[0].ir_startino = rp->ir_startino;
 		rp = XFS_INOBT_REC_ADDR(right, 1, cur);
-		INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT);
+		kp[1].ir_startino = rp->ir_startino;
 	}
 	xfs_inobt_log_keys(cur, nbp, 1, 2);
 	/*
@@ -1410,8 +1410,8 @@ xfs_inobt_rshift(
 		if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
 			return error;
 #endif
-		*rkp = *lkp; /* INT_: no change copy */
-		*rpp = *lpp; /* INT_: no change copy */
+		*rkp = *lkp;
+		*rpp = *lpp;
 		xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
 		xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
 	} else {
@@ -1420,7 +1420,7 @@ xfs_inobt_rshift(
 		memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
 		*rrp = *lrp;
 		xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-		key.ir_startino = rrp->ir_startino; /* INT_: direct copy */
+		key.ir_startino = rrp->ir_startino;
 		rkp = &key;
 	}
 	/*
@@ -1559,7 +1559,7 @@ xfs_inobt_split(
 		rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
 		memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
 		xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-		keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */
+		keyp->ir_startino = rrp->ir_startino;
 	}
 	/*
 	 * Find the left block number by looking in the buffer.
@@ -1813,9 +1813,9 @@ xfs_inobt_get_rec(
 	 * Point to the record and extract its data.
 	 */
 	rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
-	*ino = INT_GET(rec->ir_startino, ARCH_CONVERT);
-	*fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT);
-	*free = INT_GET(rec->ir_free, ARCH_CONVERT);
+	*ino = be32_to_cpu(rec->ir_startino);
+	*fcnt = be32_to_cpu(rec->ir_freecount);
+	*free = be64_to_cpu(rec->ir_free);
 	*stat = 1;
 	return 0;
 }
@@ -1930,10 +1930,10 @@ xfs_inobt_insert(
 
 	level = 0;
 	nbno = NULLAGBLOCK;
-	INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino);
-	INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount);
-	INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free);
-	ncur = (xfs_btree_cur_t *)0;
+	nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+	nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+	nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
+	ncur = NULL;
 	pcur = cur;
 	/*
 	 * Loop going up the tree, starting at the leaf level.
@@ -1965,7 +1965,7 @@ xfs_inobt_insert(
 		 */
 		if (ncur) {
 			pcur = ncur;
-			ncur = (xfs_btree_cur_t *)0;
+			ncur = NULL;
 		}
 	} while (nbno != NULLAGBLOCK);
 	*stat = i;
@@ -2060,9 +2060,9 @@ xfs_inobt_update(
 	/*
 	 * Fill in the new contents and log them.
 	 */
-	INT_SET(rp->ir_startino, ARCH_CONVERT, ino);
-	INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt);
-	INT_SET(rp->ir_free, ARCH_CONVERT, free);
+	rp->ir_startino = cpu_to_be32(ino);
+	rp->ir_freecount = cpu_to_be32(fcnt);
+	rp->ir_free = cpu_to_be64(free);
 	xfs_inobt_log_recs(cur, bp, ptr, ptr);
 	/*
 	 * Updating first record in leaf. Pass new key value up to our parent.
@@ -2070,7 +2070,7 @@ xfs_inobt_update(
 	if (ptr == 1) {
 		xfs_inobt_key_t	key;	/* key containing [ino] */
 
-		INT_SET(key.ir_startino, ARCH_CONVERT, ino);
+		key.ir_startino = cpu_to_be32(ino);
 		if ((error = xfs_inobt_updkey(cur, &key, 1)))
 			return error;
 	}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index ae3904cb1ee8..2c0e49893ff7 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -47,19 +47,24 @@ static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
 /*
  * Data record structure
  */
-typedef struct xfs_inobt_rec
-{
+typedef struct xfs_inobt_rec {
+	__be32		ir_startino;	/* starting inode number */
+	__be32		ir_freecount;	/* count of free inodes (set bits) */
+	__be64		ir_free;	/* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
 	xfs_agino_t	ir_startino;	/* starting inode number */
 	__int32_t	ir_freecount;	/* count of free inodes (set bits) */
 	xfs_inofree_t	ir_free;	/* free inode mask */
-} xfs_inobt_rec_t;
+} xfs_inobt_rec_incore_t;
+
 
 /*
  * Key structure
  */
-typedef struct xfs_inobt_key
-{
-	xfs_agino_t	ir_startino;	/* starting inode number */
+typedef struct xfs_inobt_key {
+	__be32		ir_startino;	/* starting inode number */
 } xfs_inobt_key_t;
 
 /* btree pointer type */
@@ -77,7 +82,7 @@ typedef	struct xfs_btree_sblock xfs_inobt_block_t;
 #define	XFS_INOBT_IS_FREE(rp,i)		\
 		(((rp)->ir_free & XFS_INOBT_MASK(i)) != 0)
 #define	XFS_INOBT_IS_FREE_DISK(rp,i)	\
-		((INT_GET((rp)->ir_free,ARCH_CONVERT) & XFS_INOBT_MASK(i)) != 0)
+		((be64_to_cpu((rp)->ir_free) & XFS_INOBT_MASK(i)) != 0)
 #define	XFS_INOBT_SET_FREE(rp,i)	((rp)->ir_free |= XFS_INOBT_MASK(i))
 #define	XFS_INOBT_CLR_FREE(rp,i)	((rp)->ir_free &= ~XFS_INOBT_MASK(i))
 
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 0724df7fabb7..b73d216ecaf9 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -50,7 +50,7 @@ void
 xfs_ihash_init(xfs_mount_t *mp)
 {
 	__uint64_t	icount;
-	uint		i, flags = KM_SLEEP | KM_MAYFAIL;
+	uint		i;
 
 	if (!mp->m_ihsize) {
 		icount = mp->m_maxicount ? mp->m_maxicount :
@@ -61,14 +61,13 @@ xfs_ihash_init(xfs_mount_t *mp)
 					(64 * NBPP) / sizeof(xfs_ihash_t));
 	}
 
-	while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize *
-						sizeof(xfs_ihash_t), flags))) {
-		if ((mp->m_ihsize >>= 1) <= NBPP)
-			flags = KM_SLEEP;
-	}
-	for (i = 0; i < mp->m_ihsize; i++) {
+	mp->m_ihash = kmem_zalloc_greedy(&mp->m_ihsize,
+					 NBPC * sizeof(xfs_ihash_t),
+					 mp->m_ihsize * sizeof(xfs_ihash_t),
+					 KM_SLEEP | KM_MAYFAIL | KM_LARGE);
+	mp->m_ihsize /= sizeof(xfs_ihash_t);
+	for (i = 0; i < mp->m_ihsize; i++)
 		rwlock_init(&(mp->m_ihash[i].ih_lock));
-	}
 }
 
 /*
@@ -77,7 +76,7 @@ xfs_ihash_init(xfs_mount_t *mp)
 void
 xfs_ihash_free(xfs_mount_t *mp)
 {
-	kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t));
+	kmem_free(mp->m_ihash, mp->m_ihsize * sizeof(xfs_ihash_t));
 	mp->m_ihash = NULL;
 }
 
@@ -95,7 +94,7 @@ xfs_chash_init(xfs_mount_t *mp)
 	mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
 	mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
 						 * sizeof(xfs_chash_t),
-						 KM_SLEEP);
+						 KM_SLEEP | KM_LARGE);
 	for (i = 0; i < mp->m_chsize; i++) {
 		spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
 	}
@@ -244,7 +243,9 @@ again:
 
 				XFS_STATS_INC(xs_ig_found);
 
+				spin_lock(&ip->i_flags_lock);
 				ip->i_flags &= ~XFS_IRECLAIMABLE;
+				spin_unlock(&ip->i_flags_lock);
 				version = ih->ih_version;
 				read_unlock(&ih->ih_lock);
 				xfs_ihash_promote(ih, ip, version);
@@ -290,15 +291,17 @@ again:
 
 finish_inode:
 			if (ip->i_d.di_mode == 0) {
-				if (!(flags & IGET_CREATE))
+				if (!(flags & XFS_IGET_CREATE))
 					return ENOENT;
 				xfs_iocore_inode_reinit(ip);
 			}
-	
+
 			if (lock_flags != 0)
 				xfs_ilock(ip, lock_flags);
 
+			spin_lock(&ip->i_flags_lock);
 			ip->i_flags &= ~XFS_ISTALE;
+			spin_unlock(&ip->i_flags_lock);
 
 			vn_trace_exit(vp, "xfs_iget.found",
 						(inst_t *)__return_address);
@@ -320,21 +323,20 @@ finish_inode:
 	 * Read the disk inode attributes into a new inode structure and get
 	 * a new vnode for it. This should also initialize i_ino and i_mount.
 	 */
-	error = xfs_iread(mp, tp, ino, &ip, bno);
-	if (error) {
+	error = xfs_iread(mp, tp, ino, &ip, bno,
+			  (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
+	if (error)
 		return error;
-	}
 
 	vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
 
 	xfs_inode_lock_init(ip, vp);
 	xfs_iocore_inode_init(ip);
 
-	if (lock_flags != 0) {
+	if (lock_flags)
 		xfs_ilock(ip, lock_flags);
-	}
-		
-	if ((ip->i_d.di_mode == 0) && !(flags & IGET_CREATE)) {
+
+	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
 		xfs_idestroy(ip);
 		return ENOENT;
 	}
@@ -369,7 +371,9 @@ finish_inode:
 	ih->ih_next = ip;
 	ip->i_udquot = ip->i_gdquot = NULL;
 	ih->ih_version++;
+	spin_lock(&ip->i_flags_lock);
 	ip->i_flags |= XFS_INEW;
+	spin_unlock(&ip->i_flags_lock);
 
 	write_unlock(&ih->ih_lock);
 
@@ -548,7 +552,7 @@ xfs_inode_lock_init(
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number);
 	init_waitqueue_head(&ip->i_ipin_wait);
 	atomic_set(&ip->i_pincount, 0);
-	init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number);
+	initnsema(&ip->i_flock, 1, "xfsfino");
 }
 
 /*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1f8ecff8553a..c27d7d495aa0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -854,7 +854,8 @@ xfs_iread(
 	xfs_trans_t	*tp,
 	xfs_ino_t	ino,
 	xfs_inode_t	**ipp,
-	xfs_daddr_t	bno)
+	xfs_daddr_t	bno,
+	uint		imap_flags)
 {
 	xfs_buf_t	*bp;
 	xfs_dinode_t	*dip;
@@ -866,6 +867,7 @@ xfs_iread(
 	ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
 	ip->i_ino = ino;
 	ip->i_mount = mp;
+	spin_lock_init(&ip->i_flags_lock);
 
 	/*
 	 * Get pointer's to the on-disk inode and the buffer containing it.
@@ -874,7 +876,7 @@ xfs_iread(
 	 * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
 	 * know that this is a new incore inode.
 	 */
-	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, 0);
+	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags);
 	if (error) {
 		kmem_zone_free(xfs_inode_zone, ip);
 		return error;
@@ -1113,7 +1115,7 @@ xfs_ialloc(
 	 * to prevent others from looking at until we're done.
 	 */
 	error = xfs_trans_iget(tp->t_mountp, tp, ino,
-			IGET_CREATE, XFS_ILOCK_EXCL, &ip);
+				XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
 	if (error != 0) {
 		return error;
 	}
@@ -2213,7 +2215,9 @@ xfs_ifree_cluster(
 
 			if (ip == free_ip) {
 				if (xfs_iflock_nowait(ip)) {
+					spin_lock(&ip->i_flags_lock);
 					ip->i_flags |= XFS_ISTALE;
+					spin_unlock(&ip->i_flags_lock);
 
 					if (xfs_inode_clean(ip)) {
 						xfs_ifunlock(ip);
@@ -2227,7 +2231,9 @@ xfs_ifree_cluster(
 
 			if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
 				if (xfs_iflock_nowait(ip)) {
+					spin_lock(&ip->i_flags_lock);
 					ip->i_flags |= XFS_ISTALE;
+					spin_unlock(&ip->i_flags_lock);
 
 					if (xfs_inode_clean(ip)) {
 						xfs_ifunlock(ip);
@@ -2257,7 +2263,9 @@ xfs_ifree_cluster(
 				AIL_LOCK(mp,s);
 				iip->ili_flush_lsn = iip->ili_item.li_lsn;
 				AIL_UNLOCK(mp, s);
+				spin_lock(&iip->ili_inode->i_flags_lock);
 				iip->ili_inode->i_flags |= XFS_ISTALE;
+				spin_unlock(&iip->ili_inode->i_flags_lock);
 				pre_flushed++;
 			}
 			lip = lip->li_bio_list;
@@ -2753,19 +2761,29 @@ xfs_iunpin(
 		 * call as the inode reclaim may be blocked waiting for
 		 * the inode to become unpinned.
 		 */
+		struct inode *inode = NULL;
+
+		spin_lock(&ip->i_flags_lock);
 		if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) {
 			bhv_vnode_t	*vp = XFS_ITOV_NULL(ip);
 
 			/* make sync come back and flush this inode */
 			if (vp) {
-				struct inode	*inode = vn_to_inode(vp);
+				inode = vn_to_inode(vp);
 
 				if (!(inode->i_state &
-						(I_NEW|I_FREEING|I_CLEAR)))
-					mark_inode_dirty_sync(inode);
+						(I_NEW|I_FREEING|I_CLEAR))) {
+					inode = igrab(inode);
+					if (inode)
+						mark_inode_dirty_sync(inode);
+				} else
+					inode = NULL;
 			}
 		}
+		spin_unlock(&ip->i_flags_lock);
 		wake_up(&ip->i_ipin_wait);
+		if (inode)
+			iput(inode);
 	}
 }
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index d10b76ed1e5b..e96eb0835fe6 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -267,6 +267,7 @@ typedef struct xfs_inode {
 	sema_t			i_flock;	/* inode flush lock */
 	atomic_t		i_pincount;	/* inode pin count */
 	wait_queue_head_t	i_ipin_wait;	/* inode pinning wait queue */
+	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 #ifdef HAVE_REFCACHE
 	struct xfs_inode	**i_refcache;	/* ptr to entry in ref cache */
 	struct xfs_inode	*i_release;	/* inode to unref */
@@ -389,11 +390,14 @@ typedef struct xfs_inode {
 	(((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID))
 
 /*
- * xfs_iget.c prototypes.
+ * Flags for xfs_iget()
  */
+#define XFS_IGET_CREATE		0x1
+#define XFS_IGET_BULKSTAT	0x2
 
-#define IGET_CREATE	1
-
+/*
+ * xfs_iget.c prototypes.
+ */
 void		xfs_ihash_init(struct xfs_mount *);
 void		xfs_ihash_free(struct xfs_mount *);
 void		xfs_chash_init(struct xfs_mount *);
@@ -425,7 +429,7 @@ int		xfs_itobp(struct xfs_mount *, struct xfs_trans *,
 			  xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **,
 			  xfs_daddr_t, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-			  xfs_inode_t **, xfs_daddr_t);
+			  xfs_inode_t **, xfs_daddr_t, uint);
 int		xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
 int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
 			   xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f8e80d8e7237..a7a92251eb56 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -743,21 +743,6 @@ xfs_inode_item_committed(
 }
 
 /*
- * The transaction with the inode locked has aborted.  The inode
- * must not be dirty within the transaction (unless we're forcibly
- * shutting down).  We simply unlock just as if the transaction
- * had been cancelled.
- */
-STATIC void
-xfs_inode_item_abort(
-	xfs_inode_log_item_t	*iip)
-{
-	xfs_inode_item_unlock(iip);
-	return;
-}
-
-
-/*
  * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
  * failed to get the inode flush lock but did get the inode locked SHARED.
  * Here we're trying to see if the inode buffer is incore, and if so whether it's
@@ -915,7 +900,6 @@ STATIC struct xfs_item_ops xfs_inode_item_ops = {
 	.iop_committed	= (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_inode_item_committed,
 	.iop_push	= (void(*)(xfs_log_item_t*))xfs_inode_item_push,
-	.iop_abort	= (void(*)(xfs_log_item_t*))xfs_inode_item_abort,
 	.iop_pushbuf	= (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf,
 	.iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
 					xfs_inode_item_committing
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 5db6cd1b4cf3..bfe92ea17952 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -25,52 +25,54 @@
  * must be added on to the end.
  */
 typedef struct xfs_inode_log_format {
-	unsigned short		ilf_type;	/* inode log item type */
-	unsigned short		ilf_size;	/* size of this item */
-	uint			ilf_fields;	/* flags for fields logged */
-	ushort			ilf_asize;	/* size of attr d/ext/root */
-	ushort			ilf_dsize;	/* size of data/ext/root */
-	xfs_ino_t		ilf_ino;	/* inode number */
+	__uint16_t		ilf_type;	/* inode log item type */
+	__uint16_t		ilf_size;	/* size of this item */
+	__uint32_t		ilf_fields;	/* flags for fields logged */
+	__uint16_t		ilf_asize;	/* size of attr d/ext/root */
+	__uint16_t		ilf_dsize;	/* size of data/ext/root */
+	__uint64_t		ilf_ino;	/* inode number */
 	union {
-		xfs_dev_t	ilfu_rdev;	/* rdev value for dev inode*/
+		__uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
 		uuid_t		ilfu_uuid;	/* mount point value */
 	} ilf_u;
 	__int64_t		ilf_blkno;	/* blkno of inode buffer */
-	int			ilf_len;	/* len of inode buffer */
-	int			ilf_boffset;	/* off of inode in buffer */
+	__int32_t		ilf_len;	/* len of inode buffer */
+	__int32_t		ilf_boffset;	/* off of inode in buffer */
 } xfs_inode_log_format_t;
 
+#ifndef HAVE_FORMAT32
 typedef struct xfs_inode_log_format_32 {
-	unsigned short		ilf_type;	/* 16: inode log item type */
-	unsigned short		ilf_size;	/* 16: size of this item */
-	uint			ilf_fields;	/* 32: flags for fields logged */
-	ushort			ilf_asize;	/* 32: size of attr d/ext/root */
-	ushort			ilf_dsize;	/* 32: size of data/ext/root */
-	xfs_ino_t		ilf_ino;	/* 64: inode number */
+	__uint16_t		ilf_type;	/* inode log item type */
+	__uint16_t		ilf_size;	/* size of this item */
+	__uint32_t		ilf_fields;	/* flags for fields logged */
+	__uint16_t		ilf_asize;	/* size of attr d/ext/root */
+	__uint16_t		ilf_dsize;	/* size of data/ext/root */
+	__uint64_t		ilf_ino;	/* inode number */
 	union {
-		xfs_dev_t	ilfu_rdev;	/* 32: rdev value for dev inode*/
-		uuid_t		ilfu_uuid;	/* 128: mount point value */
+		__uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* mount point value */
 	} ilf_u;
-	__int64_t		ilf_blkno;	/* 64: blkno of inode buffer */
-	int			ilf_len;	/* 32: len of inode buffer */
-	int			ilf_boffset;	/* 32: off of inode in buffer */
+	__int64_t		ilf_blkno;	/* blkno of inode buffer */
+	__int32_t		ilf_len;	/* len of inode buffer */
+	__int32_t		ilf_boffset;	/* off of inode in buffer */
 } __attribute__((packed)) xfs_inode_log_format_32_t;
+#endif
 
 typedef struct xfs_inode_log_format_64 {
-	unsigned short		ilf_type;	/* 16: inode log item type */
-	unsigned short		ilf_size;	/* 16: size of this item */
-	uint			ilf_fields;	/* 32: flags for fields logged */
-	ushort			ilf_asize;	/* 32: size of attr d/ext/root */
-	ushort			ilf_dsize;	/* 32: size of data/ext/root */
-	__uint32_t		ilf_pad;	/* 32: pad for 64 bit boundary */
-	xfs_ino_t		ilf_ino;	/* 64: inode number */
+	__uint16_t		ilf_type;	/* inode log item type */
+	__uint16_t		ilf_size;	/* size of this item */
+	__uint32_t		ilf_fields;	/* flags for fields logged */
+	__uint16_t		ilf_asize;	/* size of attr d/ext/root */
+	__uint16_t		ilf_dsize;	/* size of data/ext/root */
+	__uint32_t		ilf_pad;	/* pad for 64 bit boundary */
+	__uint64_t		ilf_ino;	/* inode number */
 	union {
-		xfs_dev_t	ilfu_rdev;	/* 32: rdev value for dev inode*/
-		uuid_t		ilfu_uuid;	/* 128: mount point value */
+		__uint32_t	ilfu_rdev;	/* rdev value for dev inode*/
+		uuid_t		ilfu_uuid;	/* mount point value */
 	} ilf_u;
-	__int64_t		ilf_blkno;	/* 64: blkno of inode buffer */
-	int			ilf_len;	/* 32: len of inode buffer */
-	int			ilf_boffset;	/* 32: off of inode in buffer */
+	__int64_t		ilf_blkno;	/* blkno of inode buffer */
+	__int32_t		ilf_len;	/* len of inode buffer */
+	__int32_t		ilf_boffset;	/* off of inode in buffer */
 } xfs_inode_log_format_64_t;
 
 /*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f1949c16df15..19655124da78 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -398,6 +398,23 @@ xfs_flush_space(
 	return 1;
 }
 
+STATIC int
+xfs_cmn_err_fsblock_zero(
+	xfs_inode_t	*ip,
+	xfs_bmbt_irec_t	*imap)
+{
+	xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount,
+			"Access to block zero in inode %llu "
+			"start_block: %llx start_off: %llx "
+			"blkcnt: %llx extent-state: %x\n",
+		(unsigned long long)ip->i_ino,
+		(unsigned long long)imap->br_startblock,
+		(unsigned long long)imap->br_startoff,
+		(unsigned long long)imap->br_blockcount,
+		imap->br_state);
+	return EFSCORRUPTED;
+}
+
 int
 xfs_iomap_write_direct(
 	xfs_inode_t	*ip,
@@ -536,23 +553,17 @@ xfs_iomap_write_direct(
 	 * Copy any maps to caller's array and return any error.
 	 */
 	if (nimaps == 0) {
-		error = (ENOSPC);
+		error = ENOSPC;
+		goto error_out;
+	}
+
+	if (unlikely(!imap.br_startblock && !(io->io_flags & XFS_IOCORE_RT))) {
+		error = xfs_cmn_err_fsblock_zero(ip, &imap);
 		goto error_out;
 	}
 
 	*ret_imap = imap;
 	*nmaps = 1;
-	if ( !(io->io_flags & XFS_IOCORE_RT)  && !ret_imap->br_startblock) {
-                cmn_err(CE_PANIC,"Access to block zero:  fs <%s> inode: %lld "
-                        "start_block : %llx start_off : %llx blkcnt : %llx "
-                        "extent-state : %x \n",
-                        (ip->i_mount)->m_fsname,
-                        (long long)ip->i_ino,
-                        (unsigned long long)ret_imap->br_startblock,
-			(unsigned long long)ret_imap->br_startoff,
-                        (unsigned long long)ret_imap->br_blockcount,
-			ret_imap->br_state);
-        }
 	return 0;
 
 error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
@@ -715,17 +726,8 @@ retry:
 		goto retry;
 	}
 
-	if (!(io->io_flags & XFS_IOCORE_RT)  && !ret_imap->br_startblock) {
-		cmn_err(CE_PANIC,"Access to block zero:  fs <%s> inode: %lld "
-                        "start_block : %llx start_off : %llx blkcnt : %llx "
-                        "extent-state : %x \n",
-                        (ip->i_mount)->m_fsname,
-                        (long long)ip->i_ino,
-                        (unsigned long long)ret_imap->br_startblock,
-			(unsigned long long)ret_imap->br_startoff,
-                        (unsigned long long)ret_imap->br_blockcount,
-			ret_imap->br_state);
-	}
+	if (unlikely(!imap[0].br_startblock && !(io->io_flags & XFS_IOCORE_RT)))
+		return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
 
 	*ret_imap = imap[0];
 	*nmaps = 1;
@@ -853,24 +855,10 @@ xfs_iomap_write_allocate(
 		 * See if we were able to allocate an extent that
 		 * covers at least part of the callers request
 		 */
-
 		for (i = 0; i < nimaps; i++) {
-			if (!(io->io_flags & XFS_IOCORE_RT)  &&
-			    !imap[i].br_startblock) {
-				cmn_err(CE_PANIC,"Access to block zero:  "
-					"fs <%s> inode: %lld "
-					"start_block : %llx start_off : %llx "
-					"blkcnt : %llx extent-state : %x \n",
-					(ip->i_mount)->m_fsname,
-					(long long)ip->i_ino,
-					(unsigned long long)
-						imap[i].br_startblock,
-					(unsigned long long)
-						imap[i].br_startoff,
-					(unsigned long long)
-				        	imap[i].br_blockcount,
-					imap[i].br_state);
-                        }
+			if (unlikely(!imap[i].br_startblock &&
+				     !(io->io_flags & XFS_IOCORE_RT)))
+				return xfs_cmn_err_fsblock_zero(ip, &imap[i]);
 			if ((offset_fsb >= imap[i].br_startoff) &&
 			    (offset_fsb < (imap[i].br_startoff +
 					   imap[i].br_blockcount))) {
@@ -941,7 +929,7 @@ xfs_iomap_write_unwritten(
 				XFS_WRITE_LOG_COUNT);
 		if (error) {
 			xfs_trans_cancel(tp, 0);
-			goto error0;
+			return XFS_ERROR(error);
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -967,19 +955,11 @@ xfs_iomap_write_unwritten(
 		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error)
-			goto error0;
-
-		if ( !(io->io_flags & XFS_IOCORE_RT)  && !imap.br_startblock) {
-			cmn_err(CE_PANIC,"Access to block zero:  fs <%s> "
-				"inode: %lld start_block : %llx start_off : "
-				"%llx blkcnt : %llx extent-state : %x \n",
-				(ip->i_mount)->m_fsname,
-				(long long)ip->i_ino,
-				(unsigned long long)imap.br_startblock,
-				(unsigned long long)imap.br_startoff,
-				(unsigned long long)imap.br_blockcount,
-				imap.br_state);
-        	}
+			return XFS_ERROR(error);
+
+		if (unlikely(!imap.br_startblock &&
+			     !(io->io_flags & XFS_IOCORE_RT)))
+			return xfs_cmn_err_fsblock_zero(ip, &imap);
 
 		if ((numblks_fsb = imap.br_blockcount) == 0) {
 			/*
@@ -999,6 +979,5 @@ error_on_bmapi_transaction:
 	xfs_bmap_cancel(&free_list);
 	xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
 	return XFS_ERROR(error);
 }
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 46249e4d1fea..7775ddc0b3c6 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,6 +39,16 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
 
+int
+xfs_internal_inum(
+	xfs_mount_t	*mp,
+	xfs_ino_t	ino)
+{
+	return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
+		(XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
+		 (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino)));
+}
+
 STATIC int
 xfs_bulkstat_one_iget(
 	xfs_mount_t	*mp,		/* mount point for filesystem */
@@ -52,7 +62,8 @@ xfs_bulkstat_one_iget(
 	bhv_vnode_t	*vp;
 	int		error;
 
-	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno);
+	error = xfs_iget(mp, NULL, ino,
+			 XFS_IGET_BULKSTAT, XFS_ILOCK_SHARED, &ip, bno);
 	if (error) {
 		*stat = BULKSTAT_RV_NOTHING;
 		return error;
@@ -212,17 +223,12 @@ xfs_bulkstat_one(
 	xfs_dinode_t	*dip;		/* dinode inode pointer */
 
 	dip = (xfs_dinode_t *)dibuff;
+	*stat = BULKSTAT_RV_NOTHING;
 
-	if (!buffer || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino ||
-	    (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) &&
-	     (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) {
-		*stat = BULKSTAT_RV_NOTHING;
+	if (!buffer || xfs_internal_inum(mp, ino))
 		return XFS_ERROR(EINVAL);
-	}
-	if (ubsize < sizeof(*buf)) {
-		*stat = BULKSTAT_RV_NOTHING;
+	if (ubsize < sizeof(*buf))
 		return XFS_ERROR(ENOMEM);
-	}
 
 	buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
 
@@ -238,8 +244,7 @@ xfs_bulkstat_one(
 	}
 
 	if (copy_to_user(buffer, buf, sizeof(*buf)))  {
-		*stat = BULKSTAT_RV_NOTHING;
-		error =  EFAULT;
+		error = EFAULT;
 		goto out_free;
 	}
 
@@ -253,6 +258,46 @@ xfs_bulkstat_one(
 }
 
 /*
+ * Test to see whether we can use the ondisk inode directly, based
+ * on the given bulkstat flags, filling in dipp accordingly.
+ * Returns zero if the inode is dodgey.
+ */
+STATIC int
+xfs_bulkstat_use_dinode(
+	xfs_mount_t	*mp,
+	int		flags,
+	xfs_buf_t	*bp,
+	int		clustidx,
+	xfs_dinode_t	**dipp)
+{
+	xfs_dinode_t	*dip;
+	unsigned int	aformat;
+
+	*dipp = NULL;
+	if (!bp || (flags & BULKSTAT_FG_IGET))
+		return 1;
+	dip = (xfs_dinode_t *)
+			xfs_buf_offset(bp, clustidx << mp->m_sb.sb_inodelog);
+	if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC ||
+	    !XFS_DINODE_GOOD_VERSION(
+			INT_GET(dip->di_core.di_version, ARCH_CONVERT)))
+		return 0;
+	if (flags & BULKSTAT_FG_QUICK) {
+		*dipp = dip;
+		return 1;
+	}
+	/* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
+	aformat = INT_GET(dip->di_core.di_aformat, ARCH_CONVERT);
+	if ((XFS_CFORK_Q(&dip->di_core) == 0) ||
+	    (aformat == XFS_DINODE_FMT_LOCAL) ||
+	    (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) {
+		*dipp = dip;
+		return 1;
+	}
+	return 1;
+}
+
+/*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
 int					/* error status */
@@ -284,10 +329,11 @@ xfs_bulkstat(
 	xfs_agino_t		gino;	/* current btree rec's start inode */
 	int			i;	/* loop index */
 	int			icount;	/* count of inodes good in irbuf */
+	size_t			irbsize; /* size of irec buffer in bytes */
 	xfs_ino_t		ino;	/* inode number (filesystem) */
-	xfs_inobt_rec_t		*irbp;	/* current irec buffer pointer */
-	xfs_inobt_rec_t		*irbuf;	/* start of irec buffer */
-	xfs_inobt_rec_t		*irbufend; /* end of good irec buffer entries */
+	xfs_inobt_rec_incore_t	*irbp;	/* current irec buffer pointer */
+	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
+	xfs_inobt_rec_incore_t	*irbufend; /* end of good irec buffer entries */
 	xfs_ino_t		lastino=0; /* last inode number returned */
 	int			nbcluster; /* # of blocks in a cluster */
 	int			nicluster; /* # of inodes in a cluster */
@@ -328,13 +374,10 @@ xfs_bulkstat(
 		(XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
 	nimask = ~(nicluster - 1);
 	nbcluster = nicluster >> mp->m_sb.sb_inopblog;
-	/*
-	 * Allocate a page-sized buffer for inode btree records.
-	 * We could try allocating something smaller, but for normal
-	 * calls we'll always (potentially) need the whole page.
-	 */
-	irbuf = kmem_alloc(NBPC, KM_SLEEP);
-	nirbuf = NBPC / sizeof(*irbuf);
+	irbuf = kmem_zalloc_greedy(&irbsize, NBPC, NBPC * 4,
+				   KM_SLEEP | KM_MAYFAIL | KM_LARGE);
+	nirbuf = irbsize / sizeof(*irbuf);
+
 	/*
 	 * Loop over the allocation groups, starting from the last
 	 * inode returned; 0 means start of the allocation group.
@@ -358,7 +401,7 @@ xfs_bulkstat(
 		 * Allocate and initialize a btree cursor for ialloc btree.
 		 */
 		cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO,
-			(xfs_inode_t *)0, 0);
+						(xfs_inode_t *)0, 0);
 		irbp = irbuf;
 		irbufend = irbuf + nirbuf;
 		end_of_ag = 0;
@@ -395,9 +438,9 @@ xfs_bulkstat(
 						gcnt++;
 				}
 				gfree |= XFS_INOBT_MASKN(0, chunkidx);
-				INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
-				INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
-				INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
+				irbp->ir_startino = gino;
+				irbp->ir_freecount = gcnt;
+				irbp->ir_free = gfree;
 				irbp++;
 				agino = gino + XFS_INODES_PER_CHUNK;
 				icount = XFS_INODES_PER_CHUNK - gcnt;
@@ -451,11 +494,27 @@ xfs_bulkstat(
 			}
 			/*
 			 * If this chunk has any allocated inodes, save it.
+			 * Also start read-ahead now for this chunk.
 			 */
 			if (gcnt < XFS_INODES_PER_CHUNK) {
-				INT_SET(irbp->ir_startino, ARCH_CONVERT, gino);
-				INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt);
-				INT_SET(irbp->ir_free, ARCH_CONVERT, gfree);
+				/*
+				 * Loop over all clusters in the next chunk.
+				 * Do a readahead if there are any allocated
+				 * inodes in that cluster.
+				 */
+				for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
+				     chunkidx = 0;
+				     chunkidx < XFS_INODES_PER_CHUNK;
+				     chunkidx += nicluster,
+				     agbno += nbcluster) {
+					if (XFS_INOBT_MASKN(chunkidx,
+							    nicluster) & ~gfree)
+						xfs_btree_reada_bufs(mp, agno,
+							agbno, nbcluster);
+				}
+				irbp->ir_startino = gino;
+				irbp->ir_freecount = gcnt;
+				irbp->ir_free = gfree;
 				irbp++;
 				icount += XFS_INODES_PER_CHUNK - gcnt;
 			}
@@ -479,33 +538,11 @@ xfs_bulkstat(
 		for (irbp = irbuf;
 		     irbp < irbufend && ubleft >= statstruct_size; irbp++) {
 			/*
-			 * Read-ahead the next chunk's worth of inodes.
-			 */
-			if (&irbp[1] < irbufend) {
-				/*
-				 * Loop over all clusters in the next chunk.
-				 * Do a readahead if there are any allocated
-				 * inodes in that cluster.
-				 */
-				for (agbno = XFS_AGINO_TO_AGBNO(mp,
-							INT_GET(irbp[1].ir_startino, ARCH_CONVERT)),
-				     chunkidx = 0;
-				     chunkidx < XFS_INODES_PER_CHUNK;
-				     chunkidx += nicluster,
-				     agbno += nbcluster) {
-					if (XFS_INOBT_MASKN(chunkidx,
-							    nicluster) &
-					    ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT)))
-						xfs_btree_reada_bufs(mp, agno,
-							agbno, nbcluster);
-				}
-			}
-			/*
 			 * Now process this chunk of inodes.
 			 */
-			for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0;
+			for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
 			     ubleft > 0 &&
-				INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK;
+				irbp->ir_freecount < XFS_INODES_PER_CHUNK;
 			     chunkidx++, clustidx++, agino++) {
 				ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
 				/*
@@ -525,11 +562,12 @@ xfs_bulkstat(
 				 */
 				if ((chunkidx & (nicluster - 1)) == 0) {
 					agbno = XFS_AGINO_TO_AGBNO(mp,
-							INT_GET(irbp->ir_startino, ARCH_CONVERT)) +
+							irbp->ir_startino) +
 						((chunkidx & nimask) >>
 						 mp->m_sb.sb_inopblog);
 
-					if (flags & BULKSTAT_FG_QUICK) {
+					if (flags & (BULKSTAT_FG_QUICK |
+						     BULKSTAT_FG_INLINE)) {
 						ino = XFS_AGINO_TO_INO(mp, agno,
 								       agino);
 						bno = XFS_AGB_TO_DADDR(mp, agno,
@@ -543,6 +581,7 @@ xfs_bulkstat(
 								      KM_SLEEP);
 						ip->i_ino = ino;
 						ip->i_mount = mp;
+						spin_lock_init(&ip->i_flags_lock);
 						if (bp)
 							xfs_buf_relse(bp);
 						error = xfs_itobp(mp, NULL, ip,
@@ -564,30 +603,34 @@ xfs_bulkstat(
 				/*
 				 * Skip if this inode is free.
 				 */
-				if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT))
+				if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
 					continue;
 				/*
 				 * Count used inodes as free so we can tell
 				 * when the chunk is used up.
 				 */
-				INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1);
+				irbp->ir_freecount++;
 				ino = XFS_AGINO_TO_INO(mp, agno, agino);
 				bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
-				if (flags & BULKSTAT_FG_QUICK) {
-					dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-					      (clustidx << mp->m_sb.sb_inodelog));
-
-					if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT)
-						    != XFS_DINODE_MAGIC
-					    || !XFS_DINODE_GOOD_VERSION(
-						    INT_GET(dip->di_core.di_version, ARCH_CONVERT)))
-						continue;
+				if (!xfs_bulkstat_use_dinode(mp, flags, bp,
+							     clustidx, &dip))
+					continue;
+				/*
+				 * If we need to do an iget, cannot hold bp.
+				 * Drop it, until starting the next cluster.
+				 */
+				if ((flags & BULKSTAT_FG_INLINE) && !dip) {
+					if (bp)
+						xfs_buf_relse(bp);
+					bp = NULL;
 				}
 
 				/*
 				 * Get the inode and fill in a single buffer.
 				 * BULKSTAT_FG_QUICK uses dip to fill it in.
 				 * BULKSTAT_FG_IGET uses igets.
+				 * BULKSTAT_FG_INLINE uses dip if we have an
+				 * inline attr fork, else igets.
 				 * See: xfs_bulkstat_one & xfs_dm_bulkstat_one.
 				 * This is also used to count inodes/blks, etc
 				 * in xfs_qm_quotacheck.
@@ -597,8 +640,15 @@ xfs_bulkstat(
 						ubleft, private_data,
 						bno, &ubused, dip, &fmterror);
 				if (fmterror == BULKSTAT_RV_NOTHING) {
-					if (error == ENOMEM)
+                                        if (error == EFAULT) {
+                                                ubleft = 0;
+                                                rval = error;
+                                                break;
+                                        }
+					else if (error == ENOMEM)
 						ubleft = 0;
+					else
+						lastino = ino;
 					continue;
 				}
 				if (fmterror == BULKSTAT_RV_GIVEUP) {
@@ -633,7 +683,7 @@ xfs_bulkstat(
 	/*
 	 * Done, we're either out of filesystem or space to put the data.
 	 */
-	kmem_free(irbuf, NBPC);
+	kmem_free(irbuf, irbsize);
 	*ubcountp = ubelem;
 	if (agno >= mp->m_sb.sb_agcount) {
 		/*
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index be5f12e07d22..f25a28862a17 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -36,15 +36,16 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
 /*
  * Values for stat return value.
  */
-#define	BULKSTAT_RV_NOTHING	0
-#define	BULKSTAT_RV_DIDONE	1
-#define	BULKSTAT_RV_GIVEUP	2
+#define BULKSTAT_RV_NOTHING	0
+#define BULKSTAT_RV_DIDONE	1
+#define BULKSTAT_RV_GIVEUP	2
 
 /*
  * Values for bulkstat flag argument.
  */
-#define	BULKSTAT_FG_IGET	0x1	/* Go through the buffer cache */
-#define	BULKSTAT_FG_QUICK	0x2	/* No iget, walk the dinode cluster */
+#define BULKSTAT_FG_IGET	0x1	/* Go through the buffer cache */
+#define BULKSTAT_FG_QUICK	0x2	/* No iget, walk the dinode cluster */
+#define BULKSTAT_FG_INLINE	0x4	/* No iget if inline attrs */
 
 /*
  * Return stat information in bulk (by-inode) for the filesystem.
@@ -80,6 +81,11 @@ xfs_bulkstat_one(
 	void			*dibuff,
 	int			*stat);
 
+int
+xfs_internal_inum(
+	xfs_mount_t		*mp,
+	xfs_ino_t		ino);
+
 int					/* error status */
 xfs_inumbers(
 	xfs_mount_t		*mp,	/* mount point for filesystem */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 21ac1a67e3e0..c48bf61f17bd 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -617,7 +617,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		reg[0].i_len  = sizeof(magic);
 		XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT);
 
-		error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, 0);
+		error = xfs_log_reserve(mp, 600, 1, &tic,
+					XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE);
 		if (!error) {
 			/* remove inited flag */
 			((xlog_ticket_t *)tic)->t_flags = 0;
@@ -655,8 +656,11 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 		} else {
 			LOG_UNLOCK(log, s);
 		}
-		if (tic)
+		if (tic) {
+			xlog_trace_loggrant(log, tic, "unmount rec");
+			xlog_ungrant_log_space(log, tic);
 			xlog_state_put_ticket(log, tic);
+		}
 	} else {
 		/*
 		 * We're already in forced_shutdown mode, couldn't
@@ -1196,7 +1200,7 @@ xlog_alloc_log(xfs_mount_t	*mp,
 			  kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
 		iclog = *iclogp;
 		iclog->hic_data = (xlog_in_core_2_t *)
-			  kmem_zalloc(iclogsize, KM_SLEEP);
+			  kmem_zalloc(iclogsize, KM_SLEEP | KM_LARGE);
 
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
@@ -2212,9 +2216,13 @@ xlog_state_do_callback(
 
 			iclog = iclog->ic_next;
 		} while (first_iclog != iclog);
-		if (repeats && (repeats % 10) == 0) {
+
+		if (repeats > 5000) {
+			flushcnt += repeats;
+			repeats = 0;
 			xfs_fs_cmn_err(CE_WARN, log->l_mp,
-				"xlog_state_do_callback: looping %d", repeats);
+				"%s: possible infinite loop (%d iterations)",
+				__FUNCTION__, flushcnt);
 		}
 	} while (!ioerrors && loopdidcallbacks);
 
@@ -2246,6 +2254,7 @@ xlog_state_do_callback(
 	}
 #endif
 
+	flushcnt = 0;
 	if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) {
 		flushcnt = log->l_flushcnt;
 		log->l_flushcnt = 0;
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index eacb3d4987f2..ebbe93f4f97b 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -48,16 +48,10 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
  */
 
 /*
- * Flags to xfs_log_mount
- */
-#define XFS_LOG_RECOVER		0x1
-
-/*
  * Flags to xfs_log_done()
  */
 #define XFS_LOG_REL_PERM_RESERV	0x1
 
-
 /*
  * Flags to xfs_log_reserve()
  *
@@ -70,8 +64,6 @@ static inline xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XFS_LOG_SLEEP		0x0
 #define XFS_LOG_NOSLEEP		0x1
 #define XFS_LOG_PERM_RESERV	0x2
-#define XFS_LOG_RESV_ALL	(XFS_LOG_NOSLEEP|XFS_LOG_PERM_RESERV)
-
 
 /*
  * Flags to xfs_log_force()
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 34bcbf50789c..9bd3cdf11a87 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -32,7 +32,6 @@ struct xfs_mount;
 #define XLOG_MIN_ICLOGS		2
 #define XLOG_MED_ICLOGS		4
 #define XLOG_MAX_ICLOGS		8
-#define XLOG_CALLBACK_SIZE	10
 #define XLOG_HEADER_MAGIC_NUM	0xFEEDbabe	/* Invalid cycle number */
 #define XLOG_VERSION_1		1
 #define XLOG_VERSION_2		2		/* Large IClogs, Log sunit */
@@ -149,9 +148,6 @@ struct xfs_mount;
 #define XLOG_WAS_CONT_TRANS	0x08	/* Cont this trans into new region */
 #define XLOG_END_TRANS		0x10	/* End a continued transaction */
 #define XLOG_UNMOUNT_TRANS	0x20	/* Unmount a filesystem transaction */
-#define XLOG_SKIP_TRANS		(XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \
-				 XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \
-				 XLOG_UNMOUNT_TRANS)
 
 #ifdef __KERNEL__
 /*
@@ -506,6 +502,12 @@ extern int	 xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
 #define XLOG_TRACE_SLEEP_FLUSH 3
 #define XLOG_TRACE_WAKE_FLUSH  4
 
+/*
+ * Unmount record type is used as a pseudo transaction type for the ticket.
+ * It's value must be outside the range of XFS_TRANS_* values.
+ */
+#define XLOG_UNMOUNT_REC_TYPE	(-1U)
+
 #endif	/* __KERNEL__ */
 
 #endif	/* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index b2bd4be4200a..e5f396ff9a3d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -331,7 +331,7 @@ typedef struct xfs_mount {
 	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
 	lock_t			m_agirotor_lock;/* .. and lock protecting it */
 	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
-	uint			m_ihsize;	/* size of next field */
+	size_t			m_ihsize;	/* size of next field */
 	struct xfs_ihash	*m_ihash;	/* fs private inode hash table*/
 	struct xfs_inode	*m_inodes;	/* active inode list */
 	struct list_head	m_del_inodes;	/* inodes to reclaim */
@@ -541,7 +541,8 @@ static inline xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp)
 #define XFS_VFSTOM(vfs) xfs_vfstom(vfs)
 static inline xfs_mount_t *xfs_vfstom(bhv_vfs_t *vfs)
 {
-	return XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops));
+	return XFS_BHVTOM(bhv_lookup_range(VFS_BHVHEAD(vfs),
+				VFS_POSITION_XFS, VFS_POSITION_XFS));
 }
 
 #define XFS_DADDR_TO_AGNO(mp,d)         xfs_daddr_to_agno(mp,d)
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index acb853b33ebb..9dcb32aa4e2e 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -281,8 +281,6 @@ typedef struct xfs_qoff_logformat {
 				 XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\
 				 XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\
 				 XFS_GQUOTA_ACCT)
-#define XFS_MOUNT_QUOTA_MASK	(XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \
-				 XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
 
 
 /*
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 5a0b678956e0..880c73271c05 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1948,7 +1948,7 @@ xfs_growfs_rt(
 	 */
 	nrextents = nrblocks;
 	do_div(nrextents, in->extsize);
-	nrbmblocks = roundup_64(nrextents, NBBY * sbp->sb_blocksize);
+	nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize);
 	nrextslog = xfs_highbit32(nrextents);
 	nrsumlevels = nrextslog + 1;
 	nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks;
@@ -1976,7 +1976,10 @@ xfs_growfs_rt(
 	if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks,
 			mp->m_sb.sb_rsumino)))
 		return error;
-	nmp = NULL;
+	/*
+	 * Allocate a new (fake) mount/sb.
+	 */
+	nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
 	/*
 	 * Loop over the bitmap blocks.
 	 * We will do everything one bitmap block at a time.
@@ -1987,10 +1990,6 @@ xfs_growfs_rt(
 		     ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
 	     bmbno < nrbmblocks;
 	     bmbno++) {
-		/*
-		 * Allocate a new (fake) mount/sb.
-		 */
-		nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP);
 		*nmp = *mp;
 		nsbp = &nmp->m_sb;
 		/*
@@ -2018,13 +2017,13 @@ xfs_growfs_rt(
 		cancelflags = 0;
 		if ((error = xfs_trans_reserve(tp, 0,
 				XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
-			goto error_exit;
+			break;
 		/*
 		 * Lock out other callers by grabbing the bitmap inode lock.
 		 */
 		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
 						XFS_ILOCK_EXCL, &ip)))
-			goto error_exit;
+			break;
 		ASSERT(ip == mp->m_rbmip);
 		/*
 		 * Update the bitmap inode's size.
@@ -2038,7 +2037,7 @@ xfs_growfs_rt(
 		 */
 		if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
 						XFS_ILOCK_EXCL, &ip)))
-			goto error_exit;
+			break;
 		ASSERT(ip == mp->m_rsumip);
 		/*
 		 * Update the summary inode's size.
@@ -2053,7 +2052,7 @@ xfs_growfs_rt(
 		    mp->m_rsumlevels != nmp->m_rsumlevels) {
 			error = xfs_rtcopy_summary(mp, nmp, tp);
 			if (error)
-				goto error_exit;
+				break;
 		}
 		/*
 		 * Update superblock fields.
@@ -2080,18 +2079,13 @@ xfs_growfs_rt(
 		error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
 			nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
 		if (error)
-			goto error_exit;
+			break;
 		/*
 		 * Mark more blocks free in the superblock.
 		 */
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS,
 			nsbp->sb_rextents - sbp->sb_rextents);
 		/*
-		 * Free the fake mp structure.
-		 */
-		kmem_free(nmp, sizeof(*nmp));
-		nmp = NULL;
-		/*
 		 * Update mp values into the real mp structure.
 		 */
 		mp->m_rsumlevels = nrsumlevels;
@@ -2101,15 +2095,15 @@ xfs_growfs_rt(
 		 */
 		xfs_trans_commit(tp, 0, NULL);
 	}
-	return 0;
+
+	if (error)
+		xfs_trans_cancel(tp, cancelflags);
 
 	/*
-	 * Error paths come here.
+	 * Free the fake mp structure.
 	 */
-error_exit:
-	if (nmp)
-		kmem_free(nmp, sizeof(*nmp));
-	xfs_trans_cancel(tp, cancelflags);
+	kmem_free(nmp, sizeof(*nmp));
+
 	return error;
 }
 
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index bf168a91ddb8..467854b45c8f 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -60,10 +60,6 @@ struct xfs_mount;
 	 XFS_SB_VERSION_LOGV2BIT | \
 	 XFS_SB_VERSION_SECTORBIT | \
 	 XFS_SB_VERSION_MOREBITSBIT)
-#define	XFS_SB_VERSION_OKSASHBITS	\
-	(XFS_SB_VERSION_NUMBITS | \
-	 XFS_SB_VERSION_REALFBITS | \
-	 XFS_SB_VERSION_OKSASHFBITS)
 #define	XFS_SB_VERSION_OKREALBITS	\
 	(XFS_SB_VERSION_NUMBITS | \
 	 XFS_SB_VERSION_OKREALFBITS | \
@@ -81,9 +77,6 @@ struct xfs_mount;
 #define XFS_SB_VERSION2_RESERVED2BIT	0x00000002
 #define XFS_SB_VERSION2_RESERVED4BIT	0x00000004
 #define XFS_SB_VERSION2_ATTR2BIT	0x00000008	/* Inline attr rework */
-#define XFS_SB_VERSION2_SASHFBITS	0xff000000	/* Mask: features that
-							   require changing
-							   PROM and SASH */
 
 #define	XFS_SB_VERSION2_OKREALFBITS	\
 	(XFS_SB_VERSION2_ATTR2BIT)
@@ -238,12 +231,6 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp)
 }
 #endif /* __KERNEL__ */
 
-#define	XFS_SB_GOOD_SASH_VERSION(sbp)	\
-	((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \
-	  ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \
-	 ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
-	  !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS)))
-
 #define	XFS_SB_VERSION_TONEW(v)	xfs_sb_version_tonew(v)
 static inline unsigned xfs_sb_version_tonew(unsigned v)
 {
@@ -461,15 +448,6 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
  * File system sector to basic block conversions.
  */
 #define XFS_FSS_TO_BB(mp,sec)	((sec) << (mp)->m_sectbb_log)
-#define XFS_BB_TO_FSS(mp,bb)	\
-	(((bb) + (XFS_FSS_TO_BB(mp,1) - 1)) >> (mp)->m_sectbb_log)
-#define XFS_BB_TO_FSST(mp,bb)	((bb) >> (mp)->m_sectbb_log)
-
-/*
- * File system sector to byte conversions.
- */
-#define XFS_FSS_TO_B(mp,sectno)	((xfs_fsize_t)(sectno) << (mp)->m_sb.sb_sectlog)
-#define XFS_B_TO_FSST(mp,b)	(((__uint64_t)(b)) >> (mp)->m_sb.sb_sectlog)
 
 /*
  * File system block to basic block conversions.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9dc88b380608..c68e00105d23 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -149,7 +149,6 @@ typedef struct xfs_item_ops {
 	void (*iop_unlock)(xfs_log_item_t *);
 	xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
 	void (*iop_push)(xfs_log_item_t *);
-	void (*iop_abort)(xfs_log_item_t *);
 	void (*iop_pushbuf)(xfs_log_item_t *);
 	void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
 } xfs_item_ops_t;
@@ -163,7 +162,6 @@ typedef struct xfs_item_ops {
 #define IOP_UNLOCK(ip)		(*(ip)->li_ops->iop_unlock)(ip)
 #define IOP_COMMITTED(ip, lsn)	(*(ip)->li_ops->iop_committed)(ip, lsn)
 #define IOP_PUSH(ip)		(*(ip)->li_ops->iop_push)(ip)
-#define IOP_ABORT(ip)		(*(ip)->li_ops->iop_abort)(ip)
 #define IOP_PUSHBUF(ip)		(*(ip)->li_ops->iop_pushbuf)(ip)
 #define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
 
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 558c87ff0c41..fc39b166d403 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -276,7 +276,7 @@ xfs_trans_update_ail(
 	xfs_mount_t	*mp,
 	xfs_log_item_t	*lip,
 	xfs_lsn_t	lsn,
-	unsigned long	s)
+	unsigned long	s) __releases(mp->m_ail_lock)
 {
 	xfs_ail_entry_t		*ailp;
 	xfs_log_item_t		*dlip=NULL;
@@ -328,7 +328,7 @@ void
 xfs_trans_delete_ail(
 	xfs_mount_t	*mp,
 	xfs_log_item_t	*lip,
-	unsigned long	s)
+	unsigned long	s) __releases(mp->m_ail_lock)
 {
 	xfs_ail_entry_t		*ailp;
 	xfs_log_item_t		*dlip;
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 13edab8a9e94..447ac4308c91 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -46,11 +46,13 @@ xfs_log_busy_slot_t		*xfs_trans_add_busy(xfs_trans_t *tp,
 /*
  * From xfs_trans_ail.c
  */
-void			xfs_trans_update_ail(struct xfs_mount *,
-				     struct xfs_log_item *, xfs_lsn_t,
-				     unsigned long);
-void			xfs_trans_delete_ail(struct xfs_mount *,
-				     struct xfs_log_item *, unsigned long);
+void			xfs_trans_update_ail(struct xfs_mount *mp,
+				     struct xfs_log_item *lip, xfs_lsn_t lsn,
+				     unsigned long s)
+				     __releases(mp->m_ail_lock);
+void			xfs_trans_delete_ail(struct xfs_mount *mp,
+				     struct xfs_log_item *lip, unsigned long s)
+				     __releases(mp->m_ail_lock);
 struct xfs_log_item	*xfs_trans_first_ail(struct xfs_mount *, int *);
 struct xfs_log_item	*xfs_trans_next_ail(struct xfs_mount *,
 				     struct xfs_log_item *, int *, int *);
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index a34796e57afb..62336a4cc5a4 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -1922,7 +1922,7 @@ xfs_showargs(
 	}
 
 	if (mp->m_flags & XFS_MOUNT_IHASHSIZE)
-		seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", mp->m_ihsize);
+		seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", (int)mp->m_ihsize);
 
 	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
 		seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 23cfa5837728..061e2ffdd1de 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2366,10 +2366,15 @@ xfs_remove(
 
 	namelen = VNAMELEN(dentry);
 
+	if (!xfs_get_dir_entry(dentry, &ip)) {
+	        dm_di_mode = ip->i_d.di_mode;
+		IRELE(ip);
+	}
+
 	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
 					DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
-					name, NULL, 0, 0, 0);
+					name, NULL, dm_di_mode, 0, 0);
 		if (error)
 			return error;
 	}
@@ -2995,7 +3000,7 @@ xfs_rmdir(
 	int			cancel_flags;
 	int			committed;
 	bhv_vnode_t		*dir_vp;
-	int			dm_di_mode = 0;
+	int			dm_di_mode = S_IFDIR;
 	int			last_cdp_link;
 	int			namelen;
 	uint			resblks;
@@ -3010,11 +3015,16 @@ xfs_rmdir(
 		return XFS_ERROR(EIO);
 	namelen = VNAMELEN(dentry);
 
+	if (!xfs_get_dir_entry(dentry, &cdp)) {
+	        dm_di_mode = cdp->i_d.di_mode;
+		IRELE(cdp);
+	}
+
 	if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
 		error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
 					dir_vp, DM_RIGHT_NULL,
 					NULL, DM_RIGHT_NULL,
-					name, NULL, 0, 0, 0);
+					name, NULL, dm_di_mode, 0, 0);
 		if (error)
 			return XFS_ERROR(error);
 	}
@@ -3834,7 +3844,9 @@ xfs_reclaim(
 		XFS_MOUNT_ILOCK(mp);
 		vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
 		list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
+		spin_lock(&ip->i_flags_lock);
 		ip->i_flags |= XFS_IRECLAIMABLE;
+		spin_unlock(&ip->i_flags_lock);
 		XFS_MOUNT_IUNLOCK(mp);
 	}
 	return 0;
@@ -3859,8 +3871,10 @@ xfs_finish_reclaim(
 	 * us.
 	 */
 	write_lock(&ih->ih_lock);
+	spin_lock(&ip->i_flags_lock);
 	if ((ip->i_flags & XFS_IRECLAIM) ||
 	    (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) {
+		spin_unlock(&ip->i_flags_lock);
 		write_unlock(&ih->ih_lock);
 		if (locked) {
 			xfs_ifunlock(ip);
@@ -3869,6 +3883,7 @@ xfs_finish_reclaim(
 		return 1;
 	}
 	ip->i_flags |= XFS_IRECLAIM;
+	spin_unlock(&ip->i_flags_lock);
 	write_unlock(&ih->ih_lock);
 
 	/*
@@ -4272,7 +4287,7 @@ xfs_free_file_space(
 	xfs_mount_t		*mp;
 	int			nimap;
 	uint			resblks;
-	int			rounding;
+	uint			rounding;
 	int			rt;
 	xfs_fileoff_t		startoffset_fsb;
 	xfs_trans_t		*tp;
@@ -4313,8 +4328,7 @@ xfs_free_file_space(
 		vn_iowait(vp);	/* wait for the completion of any pending DIOs */
 	}
 
-	rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog),
-			(__uint8_t)NBPP);
+	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
 	ilen = len + (offset & (rounding - 1));
 	ioffset = offset & ~(rounding - 1);
 	if (ilen & (rounding - 1))